gnss-sdr/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/patches for generating volk.../2014-10-17_Patch_with_proto...

57095 lines
2.5 MiB

Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ
diff -rupN /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc
--- /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc 2014-10-17 05:01:21.000000000 +0200
@@ -39,7 +39,7 @@ namespace fs = boost::filesystem;
void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_t> results) {
json_file << "{" << std::endl;
- json_file << " \"volk_gnsssdr_tests\": [" << std::endl;
+ json_file << " \"volk_tests\": [" << std::endl;
size_t len = results.size();
size_t i = 0;
BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) {
@@ -48,9 +48,9 @@ void write_json(std::ofstream &json_file
json_file << " \"vlen\": " << result.vlen << "," << std::endl;
json_file << " \"iter\": " << result.iter << "," << std::endl;
json_file << " \"best_arch_a\": \"" << result.best_arch_a
- << "\"," << std::endl;
+ << "\"," << std::endl;
json_file << " \"best_arch_u\": \"" << result.best_arch_u
- << "\"," << std::endl;
+ << "\"," << std::endl;
json_file << " \"results\": {" << std::endl;
size_t results_len = result.results.size();
size_t ri = 0;
@@ -84,26 +84,26 @@ int main(int argc, char *argv[]) {
// Adding program options
boost::program_options::options_description desc("Options");
desc.add_options()
- ("help,h", "Print help messages")
- ("benchmark,b",
- boost::program_options::value<bool>()->default_value( false )
- ->implicit_value( true ),
- "Run all kernels (benchmark mode)")
- ("tests-regex,R",
- boost::program_options::value<std::string>(),
- "Run tests matching regular expression.")
- ("json,j",
- boost::program_options::value<std::string>(),
- "JSON output file")
- ;
-
+ ("help,h", "Print help messages")
+ ("benchmark,b",
+ boost::program_options::value<bool>()->default_value( false )
+ ->implicit_value( true ),
+ "Run all kernels (benchmark mode)")
+ ("tests-regex,R",
+ boost::program_options::value<std::string>(),
+ "Run tests matching regular expression.")
+ ("json,j",
+ boost::program_options::value<std::string>(),
+ "JSON output file")
+ ;
+
// Handle the options that were given
boost::program_options::variables_map vm;
bool benchmark_mode;
std::string kernel_regex;
bool store_results = true;
std::ofstream json_file;
-
+
try {
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
boost::program_options::notify(vm);
@@ -123,20 +123,20 @@ int main(int argc, char *argv[]) {
return 1;
}
/** --help option
-*/
+ */
if ( vm.count("help") )
{
- std::cout << "The VOLK profiler." << std::endl
- << desc << std::endl;
- return 0;
+ std::cout << "The VOLK profiler." << std::endl
+ << desc << std::endl;
+ return 0;
}
-
+
if ( vm.count("json") )
{
json_file.open( vm["json"].as<std::string>().c_str() );
}
-
-
+
+
// Run tests
std::vector<volk_gnsssdr_test_results_t> results;
@@ -152,36 +152,84 @@ int main(int argc, char *argv[]) {
//VOLK_PROFILE(volk_gnsssdr_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204602, 1000, &results, benchmark_mode, kernel_regex);
-
+
+ //GNSS-SDR PROTO-KERNELS
+ //lv_32fc_t sfv = lv_cmake((float)1, (float)2);
+ //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
+
+ //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT:
+ //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex);
+
+ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+ VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+ /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/
+
// Until we can update the config on a kernel by kernel basis
- // do not overwrite volk_gnsssdr_config when using a regex.
+ // do not overwrite volk_config when using a regex.
if(store_results) {
char path[1024];
volk_gnsssdr_get_config_path(path);
-
+
const fs::path config_path(path);
-
+
if (not fs::exists(config_path.branch_path()))
{
std::cout << "Creating " << config_path.branch_path() << "..." << std::endl;
fs::create_directories(config_path.branch_path());
}
-
+
std::cout << "Writing " << config_path << "..." << std::endl;
std::ofstream config(config_path.string().c_str());
if(!config.is_open()) { //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << config_path << std::endl;
}
-
+
config << "\
-#this file is generated by volk_gnsssdr_profile.\n\
-#the function name is followed by the preferred architecture.\n\
-";
-
+ #this file is generated by volk_profile.\n\
+ #the function name is followed by the preferred architecture.\n\
+ ";
+
BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) {
config << result.config_name << " "
- << result.best_arch_a << " "
- << result.best_arch_u << std::endl;
+ << result.best_arch_a << " "
+ << result.best_arch_u << std::endl;
}
config.close();
}
Binary files /Users/andres/Desktop/volk_gnsssdr/kernels/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/kernels/.DS_Store differ
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,174 @@
+/*!
+ * \file CommonMacros.h
+ * \brief Common macros used inside the volk protokernels.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifndef INCLUDED_gnsssdr_CommonMacros_u_H
+#define INCLUDED_gnsssdr_CommonMacros_u_H
+
+ #ifdef LV_HAVE_SSE4_1
+ /*!
+ \brief Macros for U_SSE4_1
+ */
+
+ #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1
+ #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\
+ imag = _mm_srli_si128 (input1, 2);\
+ imag = _mm_blend_epi16 (input2, imag, 85);\
+ real = _mm_slli_si128 (input2, 2);\
+ real = _mm_blend_epi16 (real, input1, 85);
+ #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */
+
+ #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1
+ #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\
+ input_i_1 = _mm_cvtepi16_epi32(input);\
+ input = _mm_srli_si128 (input, 8);\
+ input_i_2 = _mm_cvtepi16_epi32(input);\
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\
+ output_ps = _mm_cvtepi32_ps(output_i32);
+ #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
+
+ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1
+ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
+ input_i_1 = _mm_cvtepi8_epi32(input);\
+ input = _mm_srli_si128 (input, 4);\
+ input_i_2 = _mm_cvtepi8_epi32(input);\
+ input = _mm_srli_si128 (input, 4);\
+ output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\
+ input_i_1 = _mm_cvtepi8_epi32(input);\
+ input = _mm_srli_si128 (input, 4);\
+ input_i_2 = _mm_cvtepi8_epi32(input);\
+ input = _mm_srli_si128 (input, 4);\
+ output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\
+ output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\
+ output_ps = _mm_cvtepi32_ps(output_i32);
+ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
+
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_SSE2
+ /*!
+ \brief Macros for U_SSE2
+ */
+
+ #ifdef LV_HAVE_SSSE3
+ /*!
+ \brief Macros for U_SSSE3
+ */
+
+ #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3
+ #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\
+ y_aux = _mm_sign_epi8 (y, x);\
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\
+ real_output = _mm_maddubs_epi16 (x_abs, y_aux);\
+ \
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\
+ y_aux = _mm_sign_epi8 (y_aux, x);\
+ imag_output = _mm_maddubs_epi16 (x_abs, y_aux);
+ #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */
+
+ #endif /* LV_HAVE_SSSE3 */
+
+ #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2
+ #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy);\
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+ #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */
+
+ #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2
+ #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\
+ imag = _mm_srli_si128 (input, 1);\
+ imag = _mm_and_si128 (imag, mult1);\
+ real = _mm_and_si128 (input, mult1);
+ #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */
+
+ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2
+ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
+ input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\
+ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
+ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
+ input_i_1 = _mm_srai_epi32(input_i_1, 24);\
+ input_i_2 = _mm_srai_epi32(input_i_2, 24);\
+ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);\
+ \
+ input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\
+ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
+ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
+ input_i_1 = _mm_srai_epi32(input_i_1, 24);\
+ input_i_2 = _mm_srai_epi32(input_i_2, 24);\
+ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */
+
+ #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2
+ #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
+ minus128control = _mm_cmpeq_epi8 (y, minus128);\
+ y = _mm_sub_epi8 (y, minus128control);
+ #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */
+
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Macros for U_GENERIC
+ */
+
+ #endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_CommonMacros_a_H
+#define INCLUDED_gnsssdr_CommonMacros_a_H
+
+ #ifdef LV_HAVE_SSE4_1
+ /*!
+ \brief Macros for A_SSE4_1
+ */
+
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_SSE2
+ /*!
+ \brief Macros for A_SSE2
+ */
+
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Macros for A_GENERIC
+ */
+
+ #endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,76 @@
+/*!
+ * \file CommonMacros_16ic_cw_epl_corr_32fc.h
+ * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
+#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
+#include "CommonMacros/CommonMacros.h"
+
+ #ifdef LV_HAVE_SSE4_1
+ /*!
+ \brief Macros for U_SSE4_1
+ */
+
+ #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1
+ #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
+
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Macros for U_GENERIC
+ */
+
+ #endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
+#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
+
+ #ifdef LV_HAVE_SSE4_1
+ /*!
+ \brief Macros for A_SSE4_1
+ */
+
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Macros for A_GENERIC
+ */
+
+ #endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,114 @@
+/*!
+ * \file CommonMacros_8ic_cw_epl_corr_32fc.h
+ * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
+#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
+#include "CommonMacros/CommonMacros.h"
+
+ #ifdef LV_HAVE_SSE4_1
+ /*!
+ \brief Macros for U_SSE4_1
+ */
+
+ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1
+ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+ \
+ imag_output = _mm_slli_si128 (imag_output, 1);\
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);\
+ \
+ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
+
+ #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1
+ #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
+ CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
+ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+ #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */
+
+ #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1
+ #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
+ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+ #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */
+
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_SSE2
+ /*!
+ \brief Macros for U_SSE2
+ */
+
+ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2
+ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+ \
+ real_output = _mm_and_si128 (real_output, mult1);\
+ imag_output = _mm_and_si128 (imag_output, mult1);\
+ imag_output = _mm_slli_si128 (imag_output, 1);\
+ output = _mm_or_si128 (real_output, imag_output);\
+ \
+ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */
+
+ #endif /* LV_HAVE_SSE2 */
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Macros for U_GENERIC
+ */
+
+ #endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
+#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
+
+ #ifdef LV_HAVE_SSE4_1
+ /*!
+ \brief Macros for A_SSE4_1
+ */
+
+ #endif /* LV_HAVE_SSE4_1 */
+
+ #ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Macros for A_GENERIC
+ */
+
+ #endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt
--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,34 @@
+####################################################################
+Common Macros inside volk_gnsssdr module
+####################################################################
+
+First of all, sorry that you need to read this: macros are evil, they cannot be debugged, you do not know where the errors come from, and the syntax is annoying. BUT this is the only way I found to share one piece of code between various proto-kernels without performance penalties.
+Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples.
+
+####################################################################
+Syntax
+####################################################################
+
+In order to allow a better understanding of the code I created the macros with a specific syntax.
+
+1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example:
+
+example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+
+First of all, you find the characters “CM”, which means CommonMacros. After that come the type and the number of inputs: “_16IC_X4” (16 bits complex integers, four inputs). The syntax for the type is the same as the one used with volk protokernels, refer to the GNURadio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output).
+The variables that the macro needs are specified when calling it in order to avoid after-compile problems: if you want to use a macro you will need to declare all the variables it needs before, or you will not be able to compile.
+
+2) Inside all the other headers, CommonMacros_XXXXXX.h you will find macros for a specific group of proto-kernels. The syntax is the same as the CommonMacros.h
+
+####################################################################
+Workflow
+####################################################################
+
+In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints.
+When it works I place the code inside a macro and I test it again.
+
+####################################################################
+Why macros
+####################################################################
+1) They are the only way I could find for sharing code between proto-kernels without performance penalty.
+2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks the results of all the SSE proto-kernel implementations against the results of the generic implementation, so if your macro is not working you will notice it after profiling it.
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,241 @@
+#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar (SSE4.1, unaligned variant)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value each converted point is divided by
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be aligned (unaligned loads and stores are used)
+ */
+static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load 8 consecutive 16 bit values (unaligned load)
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes ) so the upper 4 values land in the lower half
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Sign-extend the lower four 16 bit values of each register into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+ // Convert the leftover (num_points % 8) values one at a time
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+ \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar (SSE, unaligned variant)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value each converted point is divided by
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned (unaligned stores are used)
+ */
+static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+ // Scale the four converted values by the reciprocal of the scalar and store them
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+ // Convert the leftover (num_points % 4) values one at a time
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar (portable implementation)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value each converted point is divided by
+ \param num_points The number of data values to be converted
+ \note Output buffer does NOT need to be properly aligned
+ */
+static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ // Convert and scale one value per iteration, advancing both pointers
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+ /*!
+ \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar (SSE4.1, aligned variant)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value each converted point is divided by
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for(;number < eighthPoints; number++){
+
+ // Load 8 consecutive 16 bit values (the unaligned load intrinsic is used even in this aligned variant)
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes ) so the upper 4 values land in the lower half
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Sign-extend the lower four 16 bit values of each register into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+ // Convert the leftover (num_points % 8) values one at a time
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+ /*!
+ \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar (SSE, aligned variant)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value each converted point is divided by
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for(;number < quarterPoints; number++){
+ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+ // Scale the four converted values by the reciprocal of the scalar and store them (unaligned store intrinsic)
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+ // Convert the leftover (num_points % 4) values one at a time
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar (portable implementation)
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value each converted point is divided by
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ // Convert and scale one value per iteration, advancing both pointers
+ for(number = 0; number < num_points; number++){
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,461 @@
+/*!
+ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
+ * real part and 16 bits the imaginary part):
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 32 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 32 bits vectors), accumulating the results
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 32 bits vectors), accumulating the results
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 32 bits vectors), accumulating the results
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+ /*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 real_output_ps, imag_output_ps;
+
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ // NOTE(review): mult1 is not passed to any macro below; presumably a macro reads it by name - confirm
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ // Perform the carrier wipe-off (two unaligned 128 bit loads per operand, 4 elements apart)
+ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+ input_ptr += 4;
+ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+
+ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+ // Get early values
+ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ // Accumulate the 32 bit float results of the early correlation
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ // Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ // Get late values
+ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+ // Second half of the per-iteration advance (each pointer moves 8 elements in total)
+ input_ptr += 4;
+ carrier_ptr += 4;
+ E_code_ptr += 4;
+ P_code_ptr += 4;
+ L_code_ptr += 4;
+ }
+ // Horizontal reduction: spill each accumulator to memory and sum its four lanes
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i)
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+ // Process the remaining (num_points % 8) samples one at a time, adding onto the outputs
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get early, prompt, and late values for each
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable implementation)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ lv_16sc_t bb_signal_sample;
+ lv_16sc_t tmp1;
+ lv_16sc_t tmp2;
+ lv_16sc_t tmp3;
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *E_out = 0;
+ *P_out = 0;
+ *L_out = 0;
+ // perform Early, Prompt and Late correlation
+ // (the counter is unsigned to match num_points: no signed/unsigned comparison, no signed overflow)
+ for(unsigned int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+
+ tmp1 = bb_signal_sample * E_code[i];
+ tmp2 = bb_signal_sample * P_code[i];
+ tmp3 = bb_signal_sample * L_code[i];
+
+ // Now accumulate the early, prompt, and late values
+ *E_out += (lv_32fc_t)tmp1;
+ *P_out += (lv_32fc_t)tmp2;
+ *L_out += (lv_32fc_t)tmp3;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1, aligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 real_output_ps, imag_output_ps;
+
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ // NOTE(review): mult1 is not passed to any macro below; presumably a macro reads it by name - confirm
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ // Perform the carrier wipe-off (two aligned 128 bit loads per operand, 4 elements apart)
+ x1 = _mm_load_si128((__m128i*)input_ptr);
+ input_ptr += 4;
+ x2 = _mm_load_si128((__m128i*)input_ptr);
+
+ y1 = _mm_load_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)carrier_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+ // Get early values
+ y1 = _mm_load_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)E_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ // Accumulate the 32 bit float results of the early correlation
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ // Get prompt values
+ y1 = _mm_load_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)P_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ // Get late values
+ y1 = _mm_load_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)L_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+ // Second half of the per-iteration advance (each pointer moves 8 elements in total)
+ input_ptr += 4;
+ carrier_ptr += 4;
+ E_code_ptr += 4;
+ P_code_ptr += 4;
+ L_code_ptr += 4;
+ }
+ // Horizontal reduction: spill each accumulator to memory and sum its four lanes
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i)
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+ // Process the remaining (num_points % 8) samples one at a time, adding onto the outputs
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get early, prompt, and late values for each
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable implementation)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ lv_16sc_t bb_signal_sample;
+ lv_16sc_t tmp1;
+ lv_16sc_t tmp2;
+ lv_16sc_t tmp3;
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *E_out = 0;
+ *P_out = 0;
+ *L_out = 0;
+ // perform Early, Prompt and Late correlation
+ // (the counter is unsigned to match num_points: no signed/unsigned comparison, no signed overflow)
+ for(unsigned int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+
+ tmp1 = bb_signal_sample * E_code[i];
+ tmp2 = bb_signal_sample * P_code[i];
+ tmp3 = bb_signal_sample * L_code[i];
+
+ // Now accumulate the early, prompt, and late values
+ *E_out += (lv_32fc_t)tmp1;
+ *P_out += (lv_32fc_t)tmp2;
+ *L_out += (lv_32fc_t)tmp3;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,1568 @@
+/*!
+ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
+ * real part and 16 bits the imaginary part):
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 32 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 32 bits vectors), accumulating the results
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 32 bits vectors), accumulating the results
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 32 bits vectors), accumulating the results
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+ /*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (first method: 4 complex samples per SSE iteration, real and imaginary parts kept interleaved)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled;
+
+ __m128 z_ps_1, z_ps_2, z_E, z_P, z_L;
+ __m128i z_i_1, z_i_2;
+ // Zero-initialised so that the tail loop accumulates from a defined value when num_points < 4 (no SSE iteration runs)
+ lv_32fc_t dotProduct_E = 0;
+ lv_32fc_t dotProduct_P = 0;
+ lv_32fc_t dotProduct_L = 0;
+
+ z_E = _mm_setzero_ps();
+ z_P = _mm_setzero_ps();
+ z_L = _mm_setzero_ps();
+
+ const lv_16sc_t* _input = input;
+ const lv_16sc_t* _carrier = carrier;
+ const lv_16sc_t* _E_code = E_code;
+ const lv_16sc_t* _P_code = P_code;
+ const lv_16sc_t* _L_code = L_code;
+
+ if (sse_iters>0)
+ {
+ for(unsigned int number = 0;number < sse_iters; number++)
+ {
+ //Perform the carrier wipe-off
+ x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ // Load yl with cr,cr,dr,dr
+ // Load yh with ci,ci,di,di
+ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+ yl = _mm_unpacklo_epi16(yaux, yaux);
+ yh = _mm_unpackhi_epi16(yaux, yaux);
+
+ tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); // negate the even lanes: -ai*ci,ar*ci,-bi*di,br*di
+ bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+
+ // correlation E,P,L (3x vector scalar product)
+ // Early
+ y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+ yl = _mm_unpacklo_epi16(yaux, yaux);
+ yh = _mm_unpackhi_epi16(yaux, yaux);
+
+ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
+ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ z_i_1 = _mm_cvtepi16_epi32(z);
+ z_ps_1 = _mm_cvtepi32_ps(z_i_1);
+ z = _mm_srli_si128 (z, 8);
+ z_i_2 = _mm_cvtepi16_epi32(z);
+ z_ps_2 = _mm_cvtepi32_ps(z_i_2);
+
+ z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together
+ z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together
+
+ // Prompt
+ y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+ yl = _mm_unpacklo_epi16(yaux, yaux);
+ yh = _mm_unpackhi_epi16(yaux, yaux);
+
+ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
+ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ z_i_1 = _mm_cvtepi16_epi32(z);
+ z_ps_1 = _mm_cvtepi32_ps(z_i_1);
+ z = _mm_srli_si128 (z, 8);
+ z_i_2 = _mm_cvtepi16_epi32(z);
+ z_ps_2 = _mm_cvtepi32_ps(z_i_2);
+
+ z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together
+ z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together
+
+ // Late
+ y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+ yl = _mm_unpacklo_epi16(yaux, yaux);
+ yh = _mm_unpackhi_epi16(yaux, yaux);
+
+ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
+ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ z_i_1 = _mm_cvtepi16_epi32(z);
+ z_ps_1 = _mm_cvtepi32_ps(z_i_1);
+ z = _mm_srli_si128 (z, 8);
+ z_i_2 = _mm_cvtepi16_epi32(z);
+ z_ps_2 = _mm_cvtepi32_ps(z_i_2);
+
+ z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together
+ z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together
+
+ _input += 4;
+ _carrier += 4;
+ _E_code += 4;
+ _L_code += 4;
+ _P_code += 4;
+ }
+ // z_E, z_P and z_L each hold two partial complex sums laid out as re,im,re,im; fold each pair into a single value
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+
+ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+
+ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
+ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+ }
+ // The remaining num_points % 4 samples are processed one at a time
+ for(unsigned int i=0; i < num_points%4; ++i)
+ {
+ dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier));
+ dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier));
+ dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++));
+ }
+
+ *E_out = dotProduct_E;
+ *P_out = dotProduct_P;
+ *L_out = dotProduct_L;
+
+
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (second method: 8 complex samples per SSE iteration, real and imaginary parts split into separate registers)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+ __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
+
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ // Float accumulators, one register for the real parts and one for the imaginary parts of each correlator
+ // (a mask register that was set here but never read has been dropped)
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(unsigned int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+ input_ptr += 4;
+ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+
+ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ // De-interleave: realx/imagx collect the real/imaginary parts, lanes alternating between x1 and x2 (blend mask 85 = 0b01010101)
+ imagx = _mm_srli_si128 (x1, 2);
+ imagx = _mm_blend_epi16 (x2, imagx, 85);
+ realx = _mm_slli_si128 (x2, 2);
+ realx = _mm_blend_epi16 (realx, x1, 85);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ //Get early values
+ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1);
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
+
+ //Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
+
+ //Get late values
+ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
+
+ input_ptr += 4;
+ carrier_ptr += 4;
+ E_code_ptr += 4;
+ L_code_ptr += 4;
+ P_code_ptr += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i)
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(unsigned int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get early, late, and prompt values for each
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (third method: 8 complex samples per SSE iteration, arrays addressed by index, the two widened int32 halves are added before the float conversion)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+ unsigned int index = 0; // sample offset, shared by the SSE loop and the scalar tail loop
+ unsigned int indexPlus4 = 0; // offset of the second group of 4 samples within one SSE iteration
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+ __m128 real_output_ps, imag_output_ps;
+
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ // Float accumulators, one register for the real parts and one for the imaginary parts of each correlator
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(index = 0;index < 8*sse_iters; index+=8){
+ indexPlus4 = index + 4;
+ //Perform the carrier wipe-off
+ x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]);
+ x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]);
+
+ y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]);
+ y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]);
+ // De-interleave: realx/imagx collect the real/imaginary parts, lanes alternating between x1 and x2 (blend mask 85 = 0b01010101)
+ imagx = _mm_srli_si128 (x1, 2);
+ imagx = _mm_blend_epi16 (x2, imagx, 85);
+ realx = _mm_slli_si128 (x2, 2);
+ realx = _mm_blend_epi16 (realx, x1, 85);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+ // Complex product input*carrier, computed in 16-bit lanes: re = xr*yr - xi*yi, im = xr*yi + xi*yr
+ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ //Get early values
+ y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]);
+ y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+ // Widen the low and high halves to int32, add them, then convert the sum to float once
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]);
+ y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]);
+ y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+ }
+ // Spill the six accumulators to memory so that their four lanes can be summed with scalar code
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i)
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+ // Scalar tail: index continues from where the SSE loop stopped (0 when no SSE iteration ran)
+ lv_16sc_t bb_signal_sample;
+ for(; index < num_points; index++)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input_ptr[index] * carrier_ptr[index];
+ // Now get early, late, and prompt values for each
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]);
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]);
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]);
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+ __m128 real_output_ps, imag_output_ps;
+
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+ input_ptr += 4;
+ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+
+ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ imagx = _mm_srli_si128 (x1, 2);
+ imagx = _mm_blend_epi16 (x2, imagx, 85);
+ realx = _mm_slli_si128 (x2, 2);
+ realx = _mm_blend_epi16 (realx, x1, 85);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ //Get early values
+ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ imagy = _mm_srli_si128 (y1, 2);
+ imagy = _mm_blend_epi16 (y2, imagy, 85);
+ realy = _mm_slli_si128 (y2, 2);
+ realy = _mm_blend_epi16 (realy, y1, 85);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ input_ptr += 4;
+ carrier_ptr += 4;
+ E_code_ptr += 4;
+ L_code_ptr += 4;
+ P_code_ptr += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i)
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get early, late, and prompt values for each
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned-load SSE4.1 variant written with the CommonMacros.h helper macros)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // each SIMD iteration consumes 8 complex samples (two 128-bit loads per stream)
+
+ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; // scratch registers handed to the product macro
+ __m128i input_i_1, input_i_2, output_i32; // scratch registers handed to the convert-and-accumulate macro
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, real_output, imag_output;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; // per-lane float accumulators, reduced after the loop
+ __m128 real_output_ps, imag_output_ps;
+
+ float E_out_real = 0; // scalar totals filled by the horizontal sum after the SIMD loop
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0; // outputs start at zero so the scalar tail loop can use += even when sse_iters == 0
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x1 = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 128-bit load (presumably 4 complex samples, assuming a 4-byte lv_16sc_t)
+ input_ptr += 4; // step to the second half of this 8-sample group
+ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+
+ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) // macro defined outside this view; by its name it presumably splits x1/x2 into realx/imagx - verify in CommonMacros.h
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) // presumably the complex product input*carrier, written to the last two arguments - verify
+
+ //Get early values
+ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) // presumably converts the 16-bit lanes to float in real_output_ps - verify in CommonMacros.h
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
+ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ input_ptr += 4; // second += 4 of the iteration: move past the half that was loaded into x2/y2
+ carrier_ptr += 4;
+ E_code_ptr += 4;
+ L_code_ptr += 4;
+ P_code_ptr += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; // staging arrays used to read the four accumulator lanes from scalar code
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i) // horizontal sum: fold the four lanes of every accumulator into one scalar
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); // overwrite the zeroed outputs with the SIMD totals
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples the SIMD loop did not cover
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Multiply by each code replica, convert to lv_32fc_t and add onto the early, prompt and late outputs
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned-load SSE4.1 variant that uses one combined correlation macro per code replica)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // each SIMD iteration consumes 8 complex samples (two 128-bit loads per stream)
+
+ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; // scratch registers handed to the macros
+ __m128i input_i_1, input_i_2, output_i32; // scratch registers handed to the correlation macro
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, real_output, imag_output;
+
+ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; // per-lane float accumulators, reduced after the loop
+ __m128 real_output_ps, imag_output_ps;
+
+ float E_out_real = 0; // scalar totals filled by the horizontal sum after the SIMD loop
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0; // outputs start at zero so the scalar tail loop can use += even when sse_iters == 0
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x1 = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 128-bit load (presumably 4 complex samples, assuming a 4-byte lv_16sc_t)
+ input_ptr += 4; // step to the second half of this 8-sample group
+ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+
+ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) // macro defined outside this view; by its name it presumably splits x1/x2 into realx/imagx - verify in CommonMacros.h
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) // presumably the complex product input*carrier, written to the last two arguments - verify
+
+ //Get early values
+ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) // combined macro from CommonMacros_16ic_cw_epl_corr_32fc.h (not visible here); presumably leaves the float products in real_output_ps/imag_output_ps - verify
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ input_ptr += 4; // second += 4 of the iteration: move past the half that was loaded into x2/y2
+ carrier_ptr += 4;
+ E_code_ptr += 4;
+ L_code_ptr += 4;
+ P_code_ptr += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; // staging arrays used to read the four accumulator lanes from scalar code
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i) // horizontal sum: fold the four lanes of every accumulator into one scalar
+ {
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ }
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); // overwrite the zeroed outputs with the SIMD totals
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples the SIMD loop did not cover
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Multiply by each code replica, convert to lv_32fc_t and add onto the early, prompt and late outputs
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar implementation, one sample per loop iteration)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ lv_16sc_t bb_signal_sample; // input[i] * carrier[i] for the current index
+ lv_16sc_t tmp1; // bb_signal_sample * E_code[i]
+ lv_16sc_t tmp2; // bb_signal_sample * P_code[i]
+ lv_16sc_t tmp3; // bb_signal_sample * L_code[i]
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *E_out = 0; // the outputs double as accumulators, so they are cleared first
+ *P_out = 0;
+ *L_out = 0;
+ // perform Early, Prompt and Late correlation
+
+ for(int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+
+ tmp1 = bb_signal_sample * E_code[i]; // products are stored as lv_16sc_t before the conversion below
+ tmp2 = bb_signal_sample * P_code[i];
+ tmp3 = bb_signal_sample * L_code[i];
+
+ // Convert each product to lv_32fc_t and add it onto the early, prompt and late outputs
+ *E_out += (lv_32fc_t)tmp1;
+ *P_out += (lv_32fc_t)tmp2;
+ *L_out += (lv_32fc_t)tmp3;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+//
+//#ifdef LV_HAVE_SSE4_1
+//#include "smmintrin.h"
+///*!
+// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+// \param input The input signal input
+// \param carrier The carrier signal input
+// \param E_code Early PRN code replica input
+// \param P_code Prompt PRN code replica input
+// \param L_code Late PRN code replica input
+// \param E_out Early correlation output
+// \param P_out Prompt correlation output
+// \param L_out Late correlation output
+// \param num_points The number of complex values in vectors
+// */
+//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+//{
+// const unsigned int sse_iters = num_points / 8;
+//
+// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+//
+// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
+//
+// float E_out_real = 0;
+// float E_out_imag = 0;
+// float P_out_real = 0;
+// float P_out_imag = 0;
+// float L_out_real = 0;
+// float L_out_imag = 0;
+//
+// const lv_16sc_t* input_ptr = input;
+// const lv_16sc_t* carrier_ptr = carrier;
+//
+// const lv_16sc_t* E_code_ptr = E_code;
+// lv_32fc_t* E_out_ptr = E_out;
+// const lv_16sc_t* L_code_ptr = L_code;
+// lv_32fc_t* L_out_ptr = L_out;
+// const lv_16sc_t* P_code_ptr = P_code;
+// lv_32fc_t* P_out_ptr = P_out;
+//
+// *E_out_ptr = 0;
+// *P_out_ptr = 0;
+// *L_out_ptr = 0;
+//
+// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+//
+// real_E_code_acc = _mm_setzero_ps();
+// imag_E_code_acc = _mm_setzero_ps();
+// real_P_code_acc = _mm_setzero_ps();
+// imag_P_code_acc = _mm_setzero_ps();
+// real_L_code_acc = _mm_setzero_ps();
+// imag_L_code_acc = _mm_setzero_ps();
+//
+// if (sse_iters>0)
+// {
+// for(int number = 0;number < sse_iters; number++){
+//
+// //Perform the carrier wipe-off
+// x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+// input_ptr += 4;
+// x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+//
+// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+// carrier_ptr += 4;
+// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+//
+// imagx = _mm_srli_si128 (x1, 2);
+// imagx = _mm_blend_epi16 (x2, imagx, 85);
+// realx = _mm_slli_si128 (x2, 2);
+// realx = _mm_blend_epi16 (realx, x1, 85);
+//
+// imagy = _mm_srli_si128 (y1, 2);
+// imagy = _mm_blend_epi16 (y2, imagy, 85);
+// realy = _mm_slli_si128 (y2, 2);
+// realy = _mm_blend_epi16 (realy, y1, 85);
+//
+// realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+//
+// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+//
+// //Get early values
+// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+// E_code_ptr += 4;
+// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+//
+// imagy = _mm_srli_si128 (y1, 2);
+// imagy = _mm_blend_epi16 (y2, imagy, 85);
+// realy = _mm_slli_si128 (y2, 2);
+// realy = _mm_blend_epi16 (realy, y1, 85);
+//
+// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+//
+// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+//
+// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+// real_output = _mm_srli_si128 (real_output, 8);
+// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+//
+// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+// imag_output = _mm_srli_si128 (imag_output, 8);
+// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+//
+// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1);
+// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
+// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
+// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
+//
+// //Get prompt values
+// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+// P_code_ptr += 4;
+// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+//
+// imagy = _mm_srli_si128 (y1, 2);
+// imagy = _mm_blend_epi16 (y2, imagy, 85);
+// realy = _mm_slli_si128 (y2, 2);
+// realy = _mm_blend_epi16 (realy, y1, 85);
+//
+// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+//
+// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+//
+// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+// real_output = _mm_srli_si128 (real_output, 8);
+// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+//
+// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+// imag_output = _mm_srli_si128 (imag_output, 8);
+// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+//
+// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
+// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
+// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
+// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
+//
+// //Get late values
+// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+// L_code_ptr += 4;
+// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+//
+// imagy = _mm_srli_si128 (y1, 2);
+// imagy = _mm_blend_epi16 (y2, imagy, 85);
+// realy = _mm_slli_si128 (y2, 2);
+// realy = _mm_blend_epi16 (realy, y1, 85);
+//
+// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+//
+// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+//
+// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+// real_output = _mm_srli_si128 (real_output, 8);
+// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+//
+// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+// imag_output = _mm_srli_si128 (imag_output, 8);
+// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+//
+// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
+// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
+// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
+// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
+//
+// input_ptr += 4;
+// carrier_ptr += 4;
+// E_code_ptr += 4;
+// L_code_ptr += 4;
+// P_code_ptr += 4;
+// }
+//
+// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+//
+// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+//
+// for (int i = 0; i<4; ++i)
+// {
+// E_out_real += real_E_dotProductVector[i];
+// E_out_imag += imag_E_dotProductVector[i];
+// P_out_real += real_P_dotProductVector[i];
+// P_out_imag += imag_P_dotProductVector[i];
+// L_out_real += real_L_dotProductVector[i];
+// L_out_imag += imag_L_dotProductVector[i];
+// }
+// *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+// *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+// *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+// }
+//
+// lv_16sc_t bb_signal_sample;
+// for(int i=0; i < num_points%8; ++i)
+// {
+// //Perform the carrier wipe-off
+// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+// // Now get early, late, and prompt values for each
+// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+// }
+//}
+//#endif /* LV_HAVE_SSE4_1 */
+//
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar implementation, one sample per loop iteration)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+{
+ lv_16sc_t bb_signal_sample; // input[i] * carrier[i] for the current index
+ lv_16sc_t tmp1; // bb_signal_sample * E_code[i]
+ lv_16sc_t tmp2; // bb_signal_sample * P_code[i]
+ lv_16sc_t tmp3; // bb_signal_sample * L_code[i]
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *E_out = 0; // the outputs double as accumulators, so they are cleared first
+ *P_out = 0;
+ *L_out = 0;
+ // perform Early, Prompt and Late correlation
+
+ for(int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+
+ tmp1 = bb_signal_sample * E_code[i]; // products are stored as lv_16sc_t before the conversion below
+ tmp2 = bb_signal_sample * P_code[i];
+ tmp3 = bb_signal_sample * L_code[i];
+
+ // Convert each product to lv_32fc_t and add it onto the early, prompt and late outputs
+ *E_out += (lv_32fc_t)tmp1;
+ *P_out += (lv_32fc_t)tmp2;
+ *L_out += (lv_32fc_t)tmp3;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,595 @@
+/*!
+ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the
+ * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 32 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Very Early values are calculated by multiplying the input signal in BB by the
+ * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+ * - Very Late values are calculated by multiplying the input signal in BB by the
+ * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+ /*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // each vector iteration consumes 8 complex samples (two loads of 4 per buffer)
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+
+ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 real_output_ps, imag_output_ps;
+
+ float VE_out_real = 0;
+ float VE_out_imag = 0;
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+ float VL_out_real = 0;
+ float VL_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_16sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ *VE_out_ptr = 0; // outputs start at zero so the scalar tail loop below can accumulate even when sse_iters is 0
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ *VL_out_ptr = 0;
+
+ real_VE_code_acc = _mm_setzero_ps();
+ imag_VE_code_acc = _mm_setzero_ps();
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+ real_VL_code_acc = _mm_setzero_ps();
+ imag_VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off (unaligned loads of two consecutive groups of 4 samples)
+ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+ input_ptr += 4;
+ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+
+ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+ //Get very early values
+ y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+ VE_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+ //Get early values
+ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ //Get very late values
+ y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+ VL_code_ptr += 4;
+ y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+ input_ptr += 4; // step every pointer over the second group of 4 samples loaded in this iteration
+ carrier_ptr += 4;
+ VE_code_ptr += 4;
+ E_code_ptr += 4;
+ P_code_ptr += 4;
+ L_code_ptr += 4;
+ VL_code_ptr += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i) // horizontal sum of the 4 float lanes of every accumulator
+ {
+ VE_out_real += real_VE_dotProductVector[i];
+ VE_out_imag += imag_VE_dotProductVector[i];
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ VL_out_real += real_VL_dotProductVector[i];
+ VL_out_imag += imag_VL_dotProductVector[i];
+ }
+ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not covered by the vector loop
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+{
+ lv_16sc_t bb_signal_sample;
+ lv_16sc_t tmp1;
+ lv_16sc_t tmp2;
+ lv_16sc_t tmp3;
+ lv_16sc_t tmp4;
+ lv_16sc_t tmp5;
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *VE_out = 0; // the five outputs are reset here and then accumulated over all samples
+ *E_out = 0;
+ *P_out = 0;
+ *L_out = 0;
+ *VL_out = 0;
+ // perform Very Early, Early, Prompt, Late and Very Late correlation
+
+ for(int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+
+ tmp1 = bb_signal_sample * VE_code[i]; // each product is stored as lv_16sc_t before being converted to lv_32fc_t
+ tmp2 = bb_signal_sample * E_code[i];
+ tmp3 = bb_signal_sample * P_code[i];
+ tmp4 = bb_signal_sample * L_code[i];
+ tmp5 = bb_signal_sample * VL_code[i];
+
+ // Accumulate the very early, early, prompt, late and very late products into the lv_32fc_t outputs
+ *VE_out += (lv_32fc_t)tmp1;
+ *E_out += (lv_32fc_t)tmp2;
+ *P_out += (lv_32fc_t)tmp3;
+ *L_out += (lv_32fc_t)tmp4;
+ *VL_out += (lv_32fc_t)tmp5;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, aligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // each vector iteration consumes 8 complex samples (two loads of 4 per buffer)
+
+ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+
+ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 real_output_ps, imag_output_ps;
+
+ float VE_out_real = 0;
+ float VE_out_imag = 0;
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+ float VL_out_real = 0;
+ float VL_out_imag = 0;
+
+ const lv_16sc_t* input_ptr = input;
+ const lv_16sc_t* carrier_ptr = carrier;
+
+ const lv_16sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_16sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_16sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_16sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_16sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ *VE_out_ptr = 0; // outputs start at zero so the scalar tail loop below can accumulate even when sse_iters is 0
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ *VL_out_ptr = 0;
+
+ real_VE_code_acc = _mm_setzero_ps();
+ imag_VE_code_acc = _mm_setzero_ps();
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+ real_VL_code_acc = _mm_setzero_ps();
+ imag_VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off (_mm_load_si128 requires 16-byte aligned addresses)
+ x1 = _mm_load_si128((__m128i*)input_ptr);
+ input_ptr += 4;
+ x2 = _mm_load_si128((__m128i*)input_ptr);
+
+ y1 = _mm_load_si128((__m128i*)carrier_ptr);
+ carrier_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)carrier_ptr);
+
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+ //Get very early values
+ y1 = _mm_load_si128((__m128i*)VE_code_ptr);
+ VE_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)VE_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+ //Get early values
+ y1 = _mm_load_si128((__m128i*)E_code_ptr);
+ E_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)E_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y1 = _mm_load_si128((__m128i*)P_code_ptr);
+ P_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)P_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y1 = _mm_load_si128((__m128i*)L_code_ptr);
+ L_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)L_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ //Get very late values
+ y1 = _mm_load_si128((__m128i*)VL_code_ptr);
+ VL_code_ptr += 4;
+ y2 = _mm_load_si128((__m128i*)VL_code_ptr);
+
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+ input_ptr += 4; // step every pointer over the second group of 4 samples loaded in this iteration
+ carrier_ptr += 4;
+ VE_code_ptr += 4;
+ E_code_ptr += 4;
+ P_code_ptr += 4;
+ L_code_ptr += 4;
+ VL_code_ptr += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i) // horizontal sum of the 4 float lanes of every accumulator
+ {
+ VE_out_real += real_VE_dotProductVector[i];
+ VE_out_imag += imag_VE_dotProductVector[i];
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ VL_out_real += real_VL_dotProductVector[i];
+ VL_out_imag += imag_VL_dotProductVector[i];
+ }
+ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not covered by the vector loop
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+ }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+{
+ lv_16sc_t bb_signal_sample;
+ lv_16sc_t tmp1;
+ lv_16sc_t tmp2;
+ lv_16sc_t tmp3;
+ lv_16sc_t tmp4;
+ lv_16sc_t tmp5;
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *VE_out = 0; // the five outputs are reset here and then accumulated over all samples
+ *E_out = 0;
+ *P_out = 0;
+ *L_out = 0;
+ *VL_out = 0;
+ // perform Very Early, Early, Prompt, Late and Very Late correlation
+
+ for(int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+
+ tmp1 = bb_signal_sample * VE_code[i]; // each product is stored as lv_16sc_t before being converted to lv_32fc_t
+ tmp2 = bb_signal_sample * E_code[i];
+ tmp3 = bb_signal_sample * P_code[i];
+ tmp4 = bb_signal_sample * L_code[i];
+ tmp5 = bb_signal_sample * VL_code[i];
+
+ // Accumulate the very early, early, prompt, late and very late products into the lv_32fc_t outputs
+ *VE_out += (lv_32fc_t)tmp1;
+ *E_out += (lv_32fc_t)tmp2;
+ *P_out += (lv_32fc_t)tmp3;
+ *L_out += (lv_32fc_t)tmp4;
+ *VL_out += (lv_32fc_t)tmp5;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
+#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Accumulates (sums) the values in the input buffer, 4 floats at a time with SSE
+ \param result Pointer to the float that receives the accumulated result
+ \param inputBuffer The buffer of data to be accumulated (read with _mm_load_ps, which requires 16-byte alignment)
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full groups of 4 floats
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for(;number < quarterPoints; number++){ // keeps 4 independent partial sums, one per lane
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+ returnValue = tempBuffer[0]; // add the 4 partial sums together
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4; // index of the first value not covered by the vector loop
+ for(;number < num_points; number++){
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Accumulates (sums) the values in the input buffer with a plain scalar loop
+ \param result Pointer to the float that receives the accumulated result (0 when num_points is 0)
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+ float returnValue = 0;
+
+ for(;number < num_points; number++){ // values are added in buffer order
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,149 @@
+#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
+#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include<smmintrin.h>
+
+static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { // Writes to target[0] the index of the maximum of src0; target is left untouched when num_points is 0
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full groups of 4 floats
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // lanes become {0,1,2,3} after the first increment by 4
+
+ float max = src0[0];
+ float index = 0; // indexes are tracked as floats and converted to unsigned int at the end
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; // aligned load: _mm_load_ps requires a 16-byte aligned address
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues); // lane mask set where the stored maximum is strictly greater than the new value
+
+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); // keep the stored index where the mask is set, otherwise take the current index
+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); // same selection for the values
+ }
+
+ // Reduce the 4 per-lane maxima (and their indexes) to a single maximum
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4; // scalar tail: values not covered by the vector loop
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE
+#include<xmmintrin.h>
+
+static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { // Writes to target[0] the index of the maximum of src0; target is left untouched when num_points is 0
+ if(num_points > 0){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4; // number of full groups of 4 floats
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // lanes become {0,1,2,3} after the first increment by 4
+
+ float max = src0[0];
+ float index = 0; // indexes are tracked as floats and converted to unsigned int at the end
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for(;number < quarterPoints; number++){
+
+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; // aligned load: _mm_load_ps requires a 16-byte aligned address
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(maxValues, currentValues); // lane mask set where the stored maximum is strictly greater than the new value
+
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); // and/andnot/or select: stored index where the mask is set, current index elsewhere
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); // same selection for the values
+ }
+
+ // Reduce the 4 per-lane maxima (and their indexes) to a single maximum
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for(number = 0; number < 4; number++){
+ if(maxValuesBuffer[number] > max){
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4; // scalar tail: values not covered by the vector loop
+ for(;number < num_points; number++){
+ if(src0[number] > max){
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (unsigned int)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { // Writes to target[0] the index of the maximum of src0; target is left untouched when num_points is 0
+ if(num_points > 0){
+ float max = src0[0];
+ unsigned int index = 0;
+
+ unsigned int i = 1; // element 0 is already the initial maximum
+
+ for(; i < num_points; ++i) {
+
+ if(src0[i] > max){ // strict comparison: on equal values the earliest index is kept
+ index = i;
+ max = src0[i];
+ }
+
+ }
+ target[0] = index;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,302 @@
+#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+   \brief Scales each input sample by a scalar, clips it to [-32768, 32767] and rounds it to a 16 bit integer
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The floating point input data buffer
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+   \note Unaligned loads and stores are used, so neither buffer needs to be aligned
+ */
+static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+    const float lowerLimit = -32768;
+    const float upperLimit = 32767;
+    const unsigned int blocksOfEight = num_points / 8;
+    const __m128 scaleReg = _mm_set_ps1(scalar);
+    const __m128 lowerReg = _mm_set_ps1(lowerLimit);
+    const __m128 upperReg = _mm_set_ps1(upperLimit);
+
+    const float* src = inputVector;
+    int16_t* dst = outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    // Vector part: eight samples per iteration
+    for(block = 0; block < blocksOfEight; block++){
+        __m128 lowHalf = _mm_loadu_ps(src);
+        __m128 highHalf = _mm_loadu_ps(src + 4);
+        __m128i lowInts, highInts;
+
+        // Multiply by the scalar, then clip to [-32768, 32767]
+        lowHalf = _mm_mul_ps(lowHalf, scaleReg);
+        highHalf = _mm_mul_ps(highHalf, scaleReg);
+        lowHalf = _mm_max_ps(_mm_min_ps(lowHalf, upperReg), lowerReg);
+        highHalf = _mm_max_ps(_mm_min_ps(highHalf, upperReg), lowerReg);
+
+        // Convert to 32 bit integers and pack both halves into eight 16 bit lanes
+        lowInts = _mm_cvtps_epi32(lowHalf);
+        highInts = _mm_cvtps_epi32(highHalf);
+        _mm_storeu_si128((__m128i*)dst, _mm_packs_epi32(lowInts, highInts));
+
+        src += 8;
+        dst += 8;
+    }
+
+    // Scalar part: the up to seven samples left over
+    for(idx = blocksOfEight * 8; idx < num_points; idx++){
+        float scaled = inputVector[idx] * scalar;
+        if(scaled > upperLimit){
+            scaled = upperLimit;
+        }
+        else if(scaled < lowerLimit){
+            scaled = lowerLimit;
+        }
+        outputVector[idx] = (int16_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+   \brief Scales each input sample by a scalar, clips it to [-32768, 32767] and rounds it to a 16 bit integer
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The floating point input data buffer
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+   \note The input is read with unaligned loads, so it does not need to be aligned
+ */
+static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+    const float lowerLimit = -32768;
+    const float upperLimit = 32767;
+    const unsigned int blocksOfFour = num_points / 4;
+    const __m128 scaleReg = _mm_set_ps1(scalar);
+    const __m128 lowerReg = _mm_set_ps1(lowerLimit);
+    const __m128 upperReg = _mm_set_ps1(upperLimit);
+
+    // Staging area used to read the clipped lanes back as scalars
+    __VOLK_ATTR_ALIGNED(16) float clipped[4];
+
+    const float* src = inputVector;
+    int16_t* dst = outputVector;
+    unsigned int block;
+    unsigned int lane;
+    unsigned int idx;
+
+    // Vector part: four samples per iteration
+    for(block = 0; block < blocksOfFour; block++){
+        __m128 samples = _mm_mul_ps(_mm_loadu_ps(src), scaleReg);
+        // Clip to [-32768, 32767] while the samples are still in the register
+        samples = _mm_max_ps(_mm_min_ps(samples, upperReg), lowerReg);
+        _mm_store_ps(clipped, samples);
+        // Round each clipped lane with rintf and narrow it to int16_t
+        for(lane = 0; lane < 4; lane++){
+            dst[lane] = (int16_t)rintf(clipped[lane]);
+        }
+        src += 4;
+        dst += 4;
+    }
+
+    // Scalar part: the up to three samples left over
+    for(idx = blocksOfFour * 4; idx < num_points; idx++){
+        float scaled = inputVector[idx] * scalar;
+        if(scaled > upperLimit){
+            scaled = upperLimit;
+        }
+        else if(scaled < lowerLimit){
+            scaled = lowerLimit;
+        }
+        outputVector[idx] = (int16_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Scales each input sample by a scalar, clips it to [-32768, 32767] and rounds it to a 16 bit integer
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The floating point input data buffer
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+   \note Only scalar accesses are used, so neither buffer needs to be aligned
+ */
+static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+    const float lowerLimit = -32768;
+    const float upperLimit = 32767;
+    unsigned int idx;
+
+    // Plain C path: scale, clip and round one sample at a time
+    for(idx = 0; idx < num_points; idx++){
+        float scaled = inputVector[idx] * scalar;
+        if(scaled > upperLimit){
+            scaled = upperLimit;
+        }
+        else if(scaled < lowerLimit){
+            scaled = lowerLimit;
+        }
+        outputVector[idx] = (int16_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+ /*!
+   \brief Scales each input sample by a scalar, clips it to [-32768, 32767] and rounds it to a 16 bit integer (aligned _mm_load_ps / _mm_store_si128 accesses)
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The floating point input data buffer
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+    const float lowerLimit = -32768;
+    const float upperLimit = 32767;
+    const unsigned int blocksOfEight = num_points / 8;
+    const __m128 scaleReg = _mm_set_ps1(scalar);
+    const __m128 lowerReg = _mm_set_ps1(lowerLimit);
+    const __m128 upperReg = _mm_set_ps1(upperLimit);
+
+    const float* src = inputVector;
+    int16_t* dst = outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    // Vector part: eight samples per iteration
+    for(block = 0; block < blocksOfEight; block++){
+        __m128 lowHalf = _mm_load_ps(src);
+        __m128 highHalf = _mm_load_ps(src + 4);
+        __m128i lowInts, highInts;
+
+        // Multiply by the scalar, then clip to [-32768, 32767]
+        lowHalf = _mm_mul_ps(lowHalf, scaleReg);
+        highHalf = _mm_mul_ps(highHalf, scaleReg);
+        lowHalf = _mm_max_ps(_mm_min_ps(lowHalf, upperReg), lowerReg);
+        highHalf = _mm_max_ps(_mm_min_ps(highHalf, upperReg), lowerReg);
+
+        // Convert to 32 bit integers and pack both halves into eight 16 bit lanes
+        lowInts = _mm_cvtps_epi32(lowHalf);
+        highInts = _mm_cvtps_epi32(highHalf);
+        _mm_store_si128((__m128i*)dst, _mm_packs_epi32(lowInts, highInts));
+
+        src += 8;
+        dst += 8;
+    }
+
+    // Scalar part: the up to seven samples left over
+    for(idx = blocksOfEight * 8; idx < num_points; idx++){
+        float scaled = inputVector[idx] * scalar;
+        if(scaled > upperLimit){
+            scaled = upperLimit;
+        }
+        else if(scaled < lowerLimit){
+            scaled = lowerLimit;
+        }
+        outputVector[idx] = (int16_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+ /*!
+   \brief Scales each input sample by a scalar, clips it to [-32768, 32767] and rounds it to a 16 bit integer (input read with aligned _mm_load_ps)
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The floating point input data buffer
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+    const float lowerLimit = -32768;
+    const float upperLimit = 32767;
+    const unsigned int blocksOfFour = num_points / 4;
+    const __m128 scaleReg = _mm_set_ps1(scalar);
+    const __m128 lowerReg = _mm_set_ps1(lowerLimit);
+    const __m128 upperReg = _mm_set_ps1(upperLimit);
+
+    // Staging area used to read the clipped lanes back as scalars
+    __VOLK_ATTR_ALIGNED(16) float clipped[4];
+
+    const float* src = inputVector;
+    int16_t* dst = outputVector;
+    unsigned int block;
+    unsigned int lane;
+    unsigned int idx;
+
+    // Vector part: four samples per iteration
+    for(block = 0; block < blocksOfFour; block++){
+        __m128 samples = _mm_mul_ps(_mm_load_ps(src), scaleReg);
+        // Clip to [-32768, 32767] while the samples are still in the register
+        samples = _mm_max_ps(_mm_min_ps(samples, upperReg), lowerReg);
+        _mm_store_ps(clipped, samples);
+        // Round each clipped lane with rintf and narrow it to int16_t
+        for(lane = 0; lane < 4; lane++){
+            dst[lane] = (int16_t)rintf(clipped[lane]);
+        }
+        src += 4;
+        dst += 4;
+    }
+
+    // Scalar part: the up to three samples left over
+    for(idx = blocksOfFour * 4; idx < num_points; idx++){
+        float scaled = inputVector[idx] * scalar;
+        if(scaled > upperLimit){
+            scaled = upperLimit;
+        }
+        else if(scaled < lowerLimit){
+            scaled = lowerLimit;
+        }
+        outputVector[idx] = (int16_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Scales each input sample by a scalar, clips it to [-32768, 32767] and rounds it to a 16 bit integer
+   \param outputVector The 16 bit output data buffer
+   \param inputVector The floating point input data buffer
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+    const float lowerLimit = -32768;
+    const float upperLimit = 32767;
+    unsigned int idx;
+
+    // Plain C path: scale, clip and round one sample at a time
+    for(idx = 0; idx < num_points; idx++){
+        float scaled = inputVector[idx] * scalar;
+        if(scaled < lowerLimit){
+            scaled = lowerLimit;
+        }
+        else if(scaled > upperLimit){
+            scaled = upperLimit;
+        }
+        outputVector[idx] = (int16_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,147 @@
+#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
+#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the element-wise sum of two float vectors using unaligned SSE loads and stores
+  \param cVector The vector where the sums aVector[i] + bVector[i] are stored
+  \param aVector First vector to be added
+  \param bVector Second vector to be added
+  \param num_points The number of values read from aVector and bVector and written to cVector
+*/
+static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    const unsigned int blocksOfFour = num_points / 4;
+    const float* left = aVector;
+    const float* right = bVector;
+    float* sumOut = cVector;
+
+    unsigned int block;
+    unsigned int idx;
+
+    // Vector part: four sums per iteration
+    for(block = 0; block < blocksOfFour; block++){
+        const __m128 leftReg = _mm_loadu_ps(left);
+        const __m128 rightReg = _mm_loadu_ps(right);
+        const __m128 sumReg = _mm_add_ps(leftReg, rightReg);
+
+        _mm_storeu_ps(sumOut, sumReg);
+
+        // Step all three cursors to the next group of four floats
+        left += 4;
+        right += 4;
+        sumOut += 4;
+    }
+
+    // Scalar part: the up to three elements left over
+    for(idx = blocksOfFour * 4; idx < num_points; idx++){
+        cVector[idx] = aVector[idx] + bVector[idx];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the element-wise sum of two float vectors
+  \param cVector The vector where the sums aVector[i] + bVector[i] are stored
+  \param aVector First vector to be added
+  \param bVector Second vector to be added
+  \param num_points The number of values read from aVector and bVector and written to cVector
+*/
+static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Plain C path: one sum per iteration
+    for(idx = 0; idx < num_points; idx++){
+        const float left = aVector[idx];
+        const float right = bVector[idx];
+        cVector[idx] = left + right;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
+#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the element-wise sum of two float vectors using aligned SSE loads and stores (_mm_load_ps / _mm_store_ps)
+  \param cVector The vector where the sums aVector[i] + bVector[i] are stored
+  \param aVector First vector to be added
+  \param bVector Second vector to be added
+  \param num_points The number of values read from aVector and bVector and written to cVector
+*/
+static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    const unsigned int blocksOfFour = num_points / 4;
+    const float* left = aVector;
+    const float* right = bVector;
+    float* sumOut = cVector;
+
+    unsigned int block;
+    unsigned int idx;
+
+    // Vector part: four sums per iteration
+    for(block = 0; block < blocksOfFour; block++){
+        const __m128 leftReg = _mm_load_ps(left);
+        const __m128 rightReg = _mm_load_ps(right);
+        const __m128 sumReg = _mm_add_ps(leftReg, rightReg);
+
+        _mm_store_ps(sumOut, sumReg);
+
+        // Step all three cursors to the next group of four floats
+        left += 4;
+        right += 4;
+        sumOut += 4;
+    }
+
+    // Scalar part: the up to three elements left over
+    for(idx = blocksOfFour * 4; idx < num_points; idx++){
+        cVector[idx] = aVector[idx] + bVector[idx];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the element-wise sum of two float vectors
+  \param cVector The vector where the sums aVector[i] + bVector[i] are stored
+  \param aVector First vector to be added
+  \param bVector Second vector to be added
+  \param num_points The number of values read from aVector and bVector and written to cVector
+*/
+static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Plain C path: one sum per iteration
+    for(idx = 0; idx < num_points; idx++){
+        const float left = aVector[idx];
+        const float right = bVector[idx];
+        cVector[idx] = left + right;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and stores the result in the third vector by forwarding every argument, unchanged, to volk_gnsssdr_32f_x2_add_32f_a_orc_impl
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // only declared here; the definition lives outside this header
+static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,127 @@
+#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Computes the complex conjugate of every element of a complex vector using unaligned loads and stores
+   \param cVector The vector where the conjugated values are stored
+   \param aVector The vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    // Mask carrying -0.f (sign bit only) in lanes 1 and 3, the imaginary parts of the
+    // two loaded complex values: XOR-ing with it negates exactly those lanes
+    const __m128 signFlipMask = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    const unsigned int pairCount = num_points / 2;
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int pair;
+
+    // Vector part: two complex values (ar, ai, br, bi) per iteration
+    for(pair = 0; pair < pairCount; pair++){
+        const __m128 values = _mm_loadu_ps((const float*)src);
+        const __m128 conjugated = _mm_xor_ps(values, signFlipMask);
+
+        _mm_storeu_ps((float*)dst, conjugated);
+
+        src += 2;
+        dst += 2;
+    }
+
+    // An odd num_points leaves one trailing complex value for the scalar path
+    if(num_points & 1){
+        *dst = lv_conj(*src);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Computes the complex conjugate of every element of a complex vector
+   \param cVector The vector where the conjugated values are stored
+   \param aVector The vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Plain C path: one complex value per iteration
+    for(idx = 0; idx < num_points; idx++){
+        const lv_32fc_t value = aVector[idx];
+        cVector[idx] = lv_conj(value);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
+#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+   \brief Computes the complex conjugate of every element of a complex vector using aligned loads and stores (_mm_load_ps / _mm_store_ps)
+   \param cVector The vector where the conjugated values are stored
+   \param aVector The vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    // Mask carrying -0.f (sign bit only) in lanes 1 and 3, the imaginary parts of the
+    // two loaded complex values: XOR-ing with it negates exactly those lanes
+    const __m128 signFlipMask = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    const unsigned int pairCount = num_points / 2;
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int pair;
+
+    // Vector part: two complex values (ar, ai, br, bi) per iteration
+    for(pair = 0; pair < pairCount; pair++){
+        const __m128 values = _mm_load_ps((const float*)src);
+        const __m128 conjugated = _mm_xor_ps(values, signFlipMask);
+
+        _mm_store_ps((float*)dst, conjugated);
+
+        src += 2;
+        dst += 2;
+    }
+
+    // An odd num_points leaves one trailing complex value for the scalar path
+    if(num_points & 1){
+        *dst = lv_conj(*src);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+   \brief Computes the complex conjugate of every element of a complex vector
+   \param cVector The vector where the conjugated values are stored
+   \param aVector The vector to be conjugated
+   \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Plain C path: one complex value per iteration
+    for(idx = 0; idx < num_points; idx++){
+        const lv_32fc_t value = aVector[idx];
+        cVector[idx] = lv_conj(value);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,295 @@
+/*!
+ * \file volk_gnsssdr_32fc_convert_16ic.h
+ * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Converts complex float samples (32 bit real and imaginary parts) into complex integer samples (16 bit parts), clipping every part to [-32768, 32767]
+ \param outputVector The complex 16 bit output data buffer
+ \param inputVector The complex floating point input data buffer; it is only read, never modified
+ \param num_points The number of complex values to be converted
+ */
+static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points/4;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+    const float min_val = -32768;
+    const float max_val = 32767;
+
+    __m128 inputVal1, inputVal2;
+    __m128i intInputVal1, intInputVal2;
+    __m128 ret1, ret2;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    // Vector part: four complex values (eight floats) per iteration, unaligned accesses
+    for(unsigned int i = 0;i < sse_iters; i++){
+        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        // Clip
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1);
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    // Scalar tail: clip a local copy so the caller's const input buffer is never written
+    for(unsigned int i = 0; i < (num_points%4)*2; i++){
+        float aux = inputVectorPtr[i];
+        if(aux > max_val)
+            aux = max_val;
+        else if(aux < min_val)
+            aux = min_val;
+        outputVectorPtr[i] = (int16_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Converts complex float samples (32 bit real and imaginary parts) into complex integer samples (16 bit parts), clipping every part to [-32768, 32767]
+ \param outputVector The complex 16 bit output data buffer
+ \param inputVector The complex floating point input data buffer; it is only read, never modified
+ \param num_points The number of complex values to be converted
+ */
+static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    // Each complex value is two floats, so one 4-float register holds two of them
+    const unsigned int sse_iters = num_points/2;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+    const float min_val = -32768;
+    const float max_val = 32767;
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+    float clippedBuffer[4];
+
+    // Vector part: restricted to <xmmintrin.h> (SSE) intrinsics, because this kernel is
+    // guarded by LV_HAVE_SSE only and must not depend on SSE2 integer instructions
+    for(unsigned int i = 0;i < sse_iters; i++){
+        ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        // Clip
+        ret = _mm_max_ps(_mm_min_ps(ret, vmax_val), vmin_val);
+
+        // Read the clipped lanes back through a scalar buffer and round them with rintf
+        _mm_storeu_ps(clippedBuffer, ret);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[0]);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[1]);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[2]);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[3]);
+    }
+
+    // Scalar tail: an odd num_points leaves one complex value (two floats); clip a
+    // local copy so the caller's const input buffer is never written
+    for(unsigned int i = 0; i < (num_points%2)*2; i++){
+        float aux = inputVectorPtr[i];
+        if(aux > max_val)
+            aux = max_val;
+        else if(aux < min_val)
+            aux = min_val;
+        outputVectorPtr[i] = (int16_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts complex float samples (32 bit real and imaginary parts) into complex integer samples (16 bit parts), clipping every part to [-32768, 32767]
+ \param outputVector The complex 16 bit output data buffer
+ \param inputVector The complex floating point input data buffer; it is only read, never modified
+ \param num_points The number of complex values to be converted
+ */
+static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+    const float min_val = -32768;
+    const float max_val = 32767;
+    for(unsigned int i = 0; i < num_points*2; i++){ // two floats (real, imaginary) per complex value
+        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+        if(aux > max_val)
+            aux = max_val;
+        else if(aux < min_val)
+            aux = min_val;
+        outputVectorPtr[i] = (int16_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
+#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Converts complex float samples (32 bit real and imaginary parts) into complex integer samples (16 bit parts), clipping every part to [-32768, 32767]
+ \param outputVector The complex 16 bit output data buffer (written with aligned _mm_store_si128)
+ \param inputVector The complex floating point input data buffer (read with aligned _mm_load_ps); it is only read, never modified
+ \param num_points The number of complex values to be converted
+ */
+static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points/4;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+    const float min_val = -32768;
+    const float max_val = 32767;
+
+    __m128 inputVal1, inputVal2;
+    __m128i intInputVal1, intInputVal2;
+    __m128 ret1, ret2;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    // Vector part: four complex values (eight floats) per iteration, aligned accesses
+    for(unsigned int i = 0;i < sse_iters; i++){
+        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        // Clip
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1);
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+        outputVectorPtr += 8;
+    }
+
+    // Scalar tail: clip a local copy so the caller's const input buffer is never written
+    for(unsigned int i = 0; i < (num_points%4)*2; i++){
+        float aux = inputVectorPtr[i];
+        if(aux > max_val)
+            aux = max_val;
+        else if(aux < min_val)
+            aux = min_val;
+        outputVectorPtr[i] = (int16_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+ \brief Converts complex float samples (32 bit real and imaginary parts) into complex integer samples (16 bit parts), clipping every part to [-32768, 32767]
+ \param outputVector The complex 16 bit output data buffer
+ \param inputVector The complex floating point input data buffer (read with aligned _mm_load_ps); it is only read, never modified
+ \param num_points The number of complex values to be converted
+ */
+static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    // Each complex value is two floats, so one 4-float register holds two of them
+    const unsigned int sse_iters = num_points/2;
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+    const float min_val = -32768;
+    const float max_val = 32767;
+    __m128 ret;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+    float clippedBuffer[4];
+
+    // Vector part: restricted to <xmmintrin.h> (SSE) intrinsics, because this kernel is
+    // guarded by LV_HAVE_SSE only and must not depend on SSE2 integer instructions
+    for(unsigned int i = 0;i < sse_iters; i++){
+        ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        // Clip
+        ret = _mm_max_ps(_mm_min_ps(ret, vmax_val), vmin_val);
+
+        // Read the clipped lanes back through a scalar buffer and round them with rintf
+        _mm_storeu_ps(clippedBuffer, ret);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[0]);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[1]);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[2]);
+        *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[3]);
+    }
+
+    // Scalar tail: an odd num_points leaves one complex value (two floats); clip a
+    // local copy so the caller's const input buffer is never written
+    for(unsigned int i = 0; i < (num_points%2)*2; i++){
+        float aux = inputVectorPtr[i];
+        if(aux > max_val)
+            aux = max_val;
+        else if(aux < min_val)
+            aux = min_val;
+        outputVectorPtr[i] = (int16_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Converts complex float samples (32 bit real and imaginary parts) into complex integer samples (16 bit parts), clipping every part to [-32768, 32767]
+ \param outputVector The complex 16 bit output data buffer
+ \param inputVector The complex floating point input data buffer; it is only read, never modified
+ \param num_points The number of complex values to be converted
+ */
+static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const float* inputVectorPtr = (const float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+    const float min_val = -32768;
+    const float max_val = 32767;
+    for(unsigned int i = 0; i < num_points*2; i++){ // two floats (real, imaginary) per complex value
+        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+        if(aux > max_val)
+            aux = max_val;
+        else if(aux < min_val)
+            aux = min_val;
+        outputVectorPtr[i] = (int16_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,213 @@
+/*!
+ * \file volk_gnsssdr_32fc_convert_8ic.h
+ * \brief Volk protokernel: converts complex float32 values to complex 8-bit integer values, saturating on overflow
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts complex 32-bit float values into complex 8-bit integer values, saturating to [-128, 127] (SSE2, unaligned loads and stores)
+  \param outputVector The 8 bit complex output data buffer
+  \param inputVector The floating point complex input data buffer (never written)
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points/8; // each SIMD iteration handles 8 complex values (16 floats)
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+
+    const float min_val = -128;
+    const float max_val = 127;
+
+    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m128i int8InputVal;
+    __m128 ret1, ret2, ret3, ret4;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for(unsigned int i = 0;i < sse_iters; i++){
+        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        // Clip to [min_val, max_val]
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32-bit integer
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+        intInputVal3 = _mm_cvtps_epi32(ret3);
+        intInputVal4 = _mm_cvtps_epi32(ret4);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 -> 16 bits
+        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 -> 8 bits
+
+        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
+        outputVectorPtr += 16;
+    }
+
+    // Tail: the (num_points % 8) values left over, two floats each; clip a copy so the const input is not written
+    for(unsigned int i = 0; i < (num_points%8)*2; i++){
+        float aux = inputVectorPtr[i];
+        if(aux > max_val) aux = max_val;
+        else if(aux < min_val) aux = min_val;
+        outputVectorPtr[i] = (int8_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts complex 32-bit float values into complex 8-bit integer values, saturating to [-128, 127]
+  \param outputVector The 8 bit complex output data buffer
+  \param inputVector The floating point complex input data buffer (never written)
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+    const float min_val = -128;
+    const float max_val = 127;
+
+    for(unsigned int i = 0; i < num_points*2; i++){
+        // Clip a local copy: the input is const and must not be modified in place
+        float aux = inputVectorPtr[i];
+        if(aux > max_val) aux = max_val;
+        else if(aux < min_val) aux = min_val;
+        outputVectorPtr[i] = (int8_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
+#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts complex 32-bit float values into complex 8-bit integer values, saturating to [-128, 127] (SSE2, aligned loads and stores)
+  \param outputVector The 8 bit complex output data buffer (accessed with _mm_store_si128)
+  \param inputVector The floating point complex input data buffer (accessed with _mm_load_ps, never written)
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points/8; // each SIMD iteration handles 8 complex values (16 floats)
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+
+    const float min_val = -128;
+    const float max_val = 127;
+
+    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m128i int8InputVal;
+    __m128 ret1, ret2, ret3, ret4;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for(unsigned int i = 0;i < sse_iters; i++){
+        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        // Clip to [min_val, max_val]
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32-bit integer
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+        intInputVal3 = _mm_cvtps_epi32(ret3);
+        intInputVal4 = _mm_cvtps_epi32(ret4);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 -> 16 bits
+        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 -> 8 bits
+
+        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
+        outputVectorPtr += 16;
+    }
+
+    // Tail: the (num_points % 8) values left over, two floats each; clip a copy so the const input is not written
+    for(unsigned int i = 0; i < (num_points%8)*2; i++){
+        float aux = inputVectorPtr[i];
+        if(aux > max_val) aux = max_val;
+        else if(aux < min_val) aux = min_val;
+        outputVectorPtr[i] = (int8_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts complex 32-bit float values into complex 8-bit integer values, saturating to [-128, 127]
+  \param outputVector The 8 bit complex output data buffer
+  \param inputVector The floating point complex input data buffer (never written)
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+    const float min_val = -128;
+    const float max_val = 127;
+
+    for(unsigned int i = 0; i < num_points*2; i++){
+        // Clip a local copy: the input is const and must not be modified in place
+        float aux = inputVectorPtr[i];
+        if(aux > max_val) aux = max_val;
+        else if(aux < min_val) aux = min_val;
+        outputVectorPtr[i] = (int8_t)rintf(aux);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,228 @@
+#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes the squared magnitude (re*re + im*im) of every complex input value; SSE3 version using unaligned loads and stores
+  \param magnitudeVector The vector where the real-valued results are stored
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process; one float per value is written to magnitudeVector
+ */
+static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    const unsigned int quarterPoints = num_points / 4;
+    const float* in = (const float*)complexVector;
+    float* out = magnitudeVector;
+    unsigned int number;
+    __m128 lo, hi;
+
+    // Vectorized part: four complex values (eight floats) per iteration
+    for(number = 0; number < quarterPoints; number++){
+        lo = _mm_loadu_ps(in);
+        hi = _mm_loadu_ps(in + 4);
+        in += 8;
+
+        // Square every real and imaginary part
+        lo = _mm_mul_ps(lo, lo);
+        hi = _mm_mul_ps(hi, hi);
+
+        // The horizontal add sums each pair of adjacent squares, giving four results
+        _mm_storeu_ps(out, _mm_hadd_ps(lo, hi));
+        out += 4;
+    }
+
+    // Scalar part: the num_points % 4 values that did not fill a whole vector
+    for(number = quarterPoints * 4; number < num_points; number++){
+        const float realPart = in[0];
+        const float imagPart = in[1];
+        in += 2;
+
+        *out++ = (realPart * realPart) + (imagPart * imagPart);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the squared magnitude (re*re + im*im) of every complex input value; SSE version using unaligned loads and stores
+  \param magnitudeVector The vector where the real-valued results are stored
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process; one float per value is written to magnitudeVector
+ */
+static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    const unsigned int quarterPoints = num_points / 4;
+    // Walking pointers over the input floats and the output floats
+    const float* in = (const float*)complexVector;
+    float* out = magnitudeVector;
+    unsigned int number;
+    __m128 lo, hi, iVec, qVec;
+
+    // Vectorized part: four complex values (eight floats) per iteration
+    for(number = 0; number < quarterPoints; number++){
+        lo = _mm_loadu_ps(in);
+        hi = _mm_loadu_ps(in + 4);
+        in += 8;
+
+        // De-interleave: even-indexed floats (i1i2i3i4) and odd-indexed floats (q1q2q3q4)
+        iVec = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+        qVec = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+        // Square both halves lane by lane
+        iVec = _mm_mul_ps(iVec, iVec);
+        qVec = _mm_mul_ps(qVec, qVec);
+
+        // I^2 + Q^2 for the four values
+        _mm_storeu_ps(out, _mm_add_ps(iVec, qVec));
+        out += 4;
+    }
+
+    // Scalar part: the num_points % 4 values that did not fill a whole vector
+    for(number = quarterPoints * 4; number < num_points; number++){
+        const float realPart = in[0];
+        const float imagPart = in[1];
+        in += 2;
+
+        *out++ = (realPart * realPart) + (imagPart * imagPart);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the squared magnitude (re*re + im*im) of every complex input value; portable scalar version
+  \param magnitudeVector The vector where the real-valued results are stored
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process; one float per value is written to magnitudeVector
+ */
+static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    const float* in = (const float*)complexVector;
+    unsigned int k;
+    // Each complex value is read as two consecutive floats: first the real part, then the imaginary part
+    for(k = 0; k < num_points; k++, in += 2){
+        const float re = in[0];
+        const float im = in[1];
+        magnitudeVector[k] = (re*re) + (im*im);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes the squared magnitude (re*re + im*im) of every complex input value; SSE3 version using aligned loads and stores
+  \param magnitudeVector The vector where the real-valued results are stored (accessed with _mm_store_ps)
+  \param complexVector The vector containing the complex input values (accessed with _mm_load_ps)
+  \param num_points The number of complex values to process; one float per value is written to magnitudeVector
+ */
+static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    const unsigned int quarterPoints = num_points / 4;
+    const float* in = (const float*)complexVector;
+    float* out = magnitudeVector;
+    unsigned int number;
+    __m128 lo, hi;
+
+    // Vectorized part: four complex values (eight floats) per iteration
+    for(number = 0; number < quarterPoints; number++){
+        lo = _mm_load_ps(in);
+        hi = _mm_load_ps(in + 4);
+        in += 8;
+
+        // Square every real and imaginary part
+        lo = _mm_mul_ps(lo, lo);
+        hi = _mm_mul_ps(hi, hi);
+
+        // The horizontal add sums each pair of adjacent squares, giving four results
+        _mm_store_ps(out, _mm_hadd_ps(lo, hi));
+        out += 4;
+    }
+
+    // Scalar part: the num_points % 4 values that did not fill a whole vector
+    for(number = quarterPoints * 4; number < num_points; number++){
+        const float realPart = in[0];
+        const float imagPart = in[1];
+        in += 2;
+
+        *out++ = (realPart * realPart) + (imagPart * imagPart);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the squared magnitude (re*re + im*im) of every complex input value; SSE version using aligned loads and stores
+  \param magnitudeVector The vector where the real-valued results are stored (accessed with _mm_store_ps)
+  \param complexVector The vector containing the complex input values (accessed with _mm_load_ps)
+  \param num_points The number of complex values to process; one float per value is written to magnitudeVector
+ */
+static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    const unsigned int quarterPoints = num_points / 4;
+    // Walking pointers over the input floats and the output floats
+    const float* in = (const float*)complexVector;
+    float* out = magnitudeVector;
+    unsigned int number;
+    __m128 lo, hi, iVec, qVec;
+
+    // Vectorized part: four complex values (eight floats) per iteration
+    for(number = 0; number < quarterPoints; number++){
+        lo = _mm_load_ps(in);
+        hi = _mm_load_ps(in + 4);
+        in += 8;
+
+        // De-interleave: even-indexed floats (i1i2i3i4) and odd-indexed floats (q1q2q3q4)
+        iVec = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
+        qVec = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));
+
+        // Square both halves lane by lane
+        iVec = _mm_mul_ps(iVec, iVec);
+        qVec = _mm_mul_ps(qVec, qVec);
+
+        // I^2 + Q^2 for the four values
+        _mm_store_ps(out, _mm_add_ps(iVec, qVec));
+        out += 4;
+    }
+
+    // Scalar part: the num_points % 4 values that did not fill a whole vector
+    for(number = quarterPoints * 4; number < num_points; number++){
+        const float realPart = in[0];
+        const float imagPart = in[1];
+        in += 2;
+
+        *out++ = (realPart * realPart) + (imagPart * imagPart);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes the squared magnitude (re*re + im*im) of every complex input value; portable scalar version
+  \param magnitudeVector The vector where the real-valued results are stored
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values to process; one float per value is written to magnitudeVector
+ */
+static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    const float* in = (const float*)complexVector;
+    unsigned int k;
+    // Each complex value is read as two consecutive floats: first the real part, then the imaginary part
+    for(k = 0; k < num_points; k++, in += 2){
+        const float re = in[0];
+        const float im = in[1];
+        magnitudeVector[k] = (re*re) + (im*im);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,231 @@
+/*!
+ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h
+ * \brief Volk protokernel: divides complex float32 values by a scalar and converts them to complex 8-bit integer values, saturating on overflow
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Scales complex 32-bit float values by 1/scalar and converts them into complex 8-bit integer values, saturating to [-128, 127] (SSE2, unaligned loads and stores)
+  \param outputVector The 8 bit complex output data buffer
+  \param inputVector The floating point complex input data buffer
+  \param scalar The value the input is divided by before clipping and rounding
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+    const unsigned int sse_iters = num_points/8; // each SIMD iteration handles 8 complex values (16 floats)
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar); // the SIMD path multiplies by the reciprocal
+    const float min_val = -128;
+    const float max_val = 127;
+
+    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m128i int8InputVal;
+    __m128 ret1, ret2, ret3, ret4;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for(unsigned int i = 0;i < sse_iters; i++){
+        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
+        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
+        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
+        inputVal4 = _mm_mul_ps(inputVal4, invScalar);
+        // Clip to [min_val, max_val]
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32-bit integer
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+        intInputVal3 = _mm_cvtps_epi32(ret3);
+        intInputVal4 = _mm_cvtps_epi32(ret4);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 -> 16 bits
+        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 -> 8 bits
+
+        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
+        outputVectorPtr += 16;
+    }
+
+    // Tail: the (num_points % 8) values left over from the 8-per-iteration SIMD loop, two floats each
+    for(unsigned int i = 0; i < (num_points%8)*2; i++){
+        float scaled = inputVectorPtr[i]/scalar;
+        if(scaled > max_val)
+            scaled = max_val;
+        else if(scaled < min_val)
+            scaled = min_val;
+        outputVectorPtr[i] = (int8_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Divides complex 32-bit float values by a scalar and converts them into complex 8-bit integer values, saturating to [-128, 127]
+  \param outputVector The 8 bit complex output data buffer
+  \param inputVector The floating point complex input data buffer
+  \param scalar The value every input float is divided by before clipping and rounding
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+    const float* in = (const float*)inputVector;
+    int8_t* out = (int8_t*)outputVector;
+    const float lower = -128;
+    const float upper = 127;
+
+    for(unsigned int k = 0; k < num_points*2; k++){
+        const float quotient = in[k]/scalar;
+
+        // Saturate to the int8_t range before rounding
+        const float clipped = (quotient > upper) ? upper : ((quotient < lower) ? lower : quotient);
+
+        out[k] = (int8_t)rintf(clipped);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Scales complex 32-bit float values by 1/scalar and converts them into complex 8-bit integer values, saturating to [-128, 127] (SSE2, aligned loads and stores)
+  \param outputVector The 8 bit complex output data buffer (accessed with _mm_store_si128)
+  \param inputVector The floating point complex input data buffer (accessed with _mm_load_ps)
+  \param scalar The value the input is divided by before clipping and rounding
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+    const unsigned int sse_iters = num_points/8; // each SIMD iteration handles 8 complex values (16 floats)
+
+    const float* inputVectorPtr = (const float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar); // the SIMD path multiplies by the reciprocal
+    const float min_val = -128;
+    const float max_val = 127;
+
+    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m128i int8InputVal;
+    __m128 ret1, ret2, ret3, ret4;
+    __m128 vmin_val = _mm_set_ps1(min_val);
+    __m128 vmax_val = _mm_set_ps1(max_val);
+
+    for(unsigned int i = 0;i < sse_iters; i++){
+        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
+        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
+        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
+        inputVal4 = _mm_mul_ps(inputVal4, invScalar);
+        // Clip to [min_val, max_val]
+        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+
+        intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32-bit integer
+        intInputVal2 = _mm_cvtps_epi32(ret2);
+        intInputVal3 = _mm_cvtps_epi32(ret3);
+        intInputVal4 = _mm_cvtps_epi32(ret4);
+
+        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 -> 16 bits
+        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 -> 8 bits
+
+        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
+        outputVectorPtr += 16;
+    }
+
+    // Tail: the (num_points % 8) values left over from the 8-per-iteration SIMD loop, two floats each
+    for(unsigned int i = 0; i < (num_points%8)*2; i++){
+        float scaled = inputVectorPtr[i]/scalar;
+        if(scaled > max_val)
+            scaled = max_val;
+        else if(scaled < min_val)
+            scaled = min_val;
+        outputVectorPtr[i] = (int8_t)rintf(scaled);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Divides complex 32-bit float values by a scalar and converts them into complex 8-bit integer values, saturating to [-128, 127]
+  \param outputVector The 8 bit complex output data buffer
+  \param inputVector The floating point complex input data buffer
+  \param scalar The value every input float is divided by before clipping and rounding
+  \param num_points The number of complex values to be converted (two floats are read per value)
+ */
+static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+    const float* in = (const float*)inputVector;
+    int8_t* out = (int8_t*)outputVector;
+    const float lower = -128;
+    const float upper = 127;
+
+    for(unsigned int k = 0; k < num_points*2; k++){
+        const float quotient = in[k]/scalar;
+
+        // Saturate to the int8_t range before rounding
+        const float clipped = (quotient > upper) ? upper : ((quotient < lower) ? lower : quotient);
+
+        out[k] = (int8_t)rintf(clipped);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,266 @@
+/*!
+ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc
+ * \brief Volk protokernel: replaces the tracking function for update_local_code
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that replaces the tracking function for update_local_code
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Fills d_very_early_code with entries of d_ca_code selected by a code phase that advances linearly with the output sample index (SSE4.1, unaligned stores)
+ \param d_very_early_code Output buffer; receives num_points values copied from d_ca_code
+ \param d_very_early_late_spc_chips Spacing term; twice this value is subtracted from the code phase before wrapping
+ \param code_length_half_chips Divisor of the floating point remainder applied to the shifted code phase
+ \param code_phase_step_half_chips Code phase increment between consecutive output samples
+ \param tcode_half_chips_input Code phase of the first output sample
+ \param d_ca_code Source code table, read at index 2 + round(fmod(phase - 2*spacing, length))
+ \param num_points The number of output samples to generate
+ */
+static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 4;
+
+    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
+
+    // Lane k holds the code phase of the k-th sample of the current group of four
+    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
+    // Every vector iteration moves all four phases forward by four samples
+    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
+    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
+    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
+    __m128 twos = _mm_set1_ps (2);
+    __m128i associated_chip_index_array_int;
+
+    __VOLK_ATTR_ALIGNED(16) int32_t output[4];
+
+    for (unsigned int i = 0; i < sse_iters; i++)
+    {
+        //fmod = numer - tquot * denom; tquot = numer/denom truncated
+        //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
+        fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
+        tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
+        tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
+        fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
+
+        // NOTE: _MM_FROUND_TO_NEAREST_INT resolves ties to even while round() in the scalar tail resolves them away from zero, so exact .5 values can select different chips
+        associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
+        associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
+        associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
+        _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);
+
+        //d_very_early_code[i] = d_ca_code[associated_chip_index];
+        *d_very_early_code++ = d_ca_code[output[0]];
+        *d_very_early_code++ = d_ca_code[output[1]];
+        *d_very_early_code++ = d_ca_code[output[2]];
+        *d_very_early_code++ = d_ca_code[output[3]];
+
+        //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+        tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
+    }
+
+    if (num_points%4!=0)
+    {
+        // Scalar tail for the last num_points%4 samples; it resumes from lane 0 of the vector phase
+        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
+        // Float store intrinsic: the source is a __m128, which the integer store _mm_storeu_si128 only accepts under lax vector conversions
+        _mm_storeu_ps (tcode_half_chips_stored, tcode_half_chips_array);
+
+        int associated_chip_index;
+        float tcode_half_chips = tcode_half_chips_stored[0];
+        float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+
+        for (unsigned int i = 0; i < num_points%4; i++)
+        {
+            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+            d_very_early_code[i] = d_ca_code[associated_chip_index];
+            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+        }
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Fills d_very_early_code with entries of d_ca_code selected by a code phase that advances linearly with the output sample index
+ \param d_very_early_code Output buffer; receives num_points values copied from d_ca_code
+ \param d_very_early_late_spc_chips Spacing term; twice this value is subtracted from the code phase before wrapping
+ \param code_length_half_chips Divisor of the floating point remainder applied to the shifted code phase
+ \param code_phase_step_half_chips Code phase increment between consecutive output samples
+ \param tcode_half_chips_input Code phase of the first output sample
+ \param d_ca_code Source code table, read at index 2 + round(fmod(phase - 2*spacing, length))
+ \param num_points The number of output samples to generate
+ */
+static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+
+    // NOTE: the scalar arguments are used exactly as supplied; they must never be
+    // overwritten through casted pointers, since they are const objects.
+
+    int associated_chip_index;
+    float tcode_half_chips = tcode_half_chips_input;
+    float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+
+    for (unsigned int i = 0; i < num_points; i++)
+    {
+        // Chip index: phase shifted back by twice the spacing, wrapped with fmod, rounded and offset by 2
+        associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+        d_very_early_code[i] = d_ca_code[associated_chip_index];
+        // Advance the code phase to the next output sample
+        tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Fills d_very_early_code with entries of d_ca_code selected by a code phase that advances linearly with the output sample index (SSE4.1, aligned stores to local scratch buffers)
+ \param d_very_early_code Output buffer; receives num_points values copied from d_ca_code
+ \param d_very_early_late_spc_chips Spacing term; twice this value is subtracted from the code phase before wrapping
+ \param code_length_half_chips Divisor of the floating point remainder applied to the shifted code phase
+ \param code_phase_step_half_chips Code phase increment between consecutive output samples
+ \param tcode_half_chips_input Code phase of the first output sample
+ \param d_ca_code Source code table, read at index 2 + round(fmod(phase - 2*spacing, length))
+ \param num_points The number of output samples to generate
+ */
+static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 4;
+
+    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
+
+    // Lane k holds the code phase of the k-th sample of the current group of four
+    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
+    // Every vector iteration moves all four phases forward by four samples
+    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
+    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
+    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
+    __m128 twos = _mm_set1_ps (2);
+    __m128i associated_chip_index_array_int;
+
+    __VOLK_ATTR_ALIGNED(16) int32_t output[4];
+
+    for (unsigned int i = 0; i < sse_iters; i++)
+    {
+        //fmod = numer - tquot * denom; tquot = numer/denom truncated
+        //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
+        fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
+        tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
+        tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
+        fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
+
+        // NOTE: _MM_FROUND_TO_NEAREST_INT resolves ties to even while round() in the scalar tail resolves them away from zero, so exact .5 values can select different chips
+        associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
+        associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
+        associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
+        _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);
+
+        //d_very_early_code[i] = d_ca_code[associated_chip_index];
+        *d_very_early_code++ = d_ca_code[output[0]];
+        *d_very_early_code++ = d_ca_code[output[1]];
+        *d_very_early_code++ = d_ca_code[output[2]];
+        *d_very_early_code++ = d_ca_code[output[3]];
+
+        //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+        tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
+    }
+
+    if (num_points%4!=0)
+    {
+        // Scalar tail for the last num_points%4 samples; it resumes from lane 0 of the vector phase
+        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
+        // Float store intrinsic: the source is a __m128, which the integer store _mm_store_si128 only accepts under lax vector conversions
+        _mm_store_ps (tcode_half_chips_stored, tcode_half_chips_array);
+
+        int associated_chip_index;
+        float tcode_half_chips = tcode_half_chips_stored[0];
+        float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+
+        for (unsigned int i = 0; i < num_points%4; i++)
+        {
+            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+            d_very_early_code[i] = d_ca_code[associated_chip_index];
+            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+        }
+    }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Fills d_very_early_code with entries of d_ca_code selected by a code phase that advances linearly with the output sample index
+ \param d_very_early_code Output buffer; receives num_points values copied from d_ca_code
+ \param d_very_early_late_spc_chips Spacing term; twice this value is subtracted from the code phase before wrapping
+ \param code_length_half_chips Divisor of the floating point remainder applied to the shifted code phase
+ \param code_phase_step_half_chips Code phase increment between consecutive output samples
+ \param tcode_half_chips_input Code phase of the first output sample
+ \param d_ca_code Source code table, read at index 2 + round(fmod(phase - 2*spacing, length))
+ \param num_points The number of output samples to generate
+ */
+static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+
+    // Twice the spacing is loop invariant, so it is computed once up front
+    const float spacing_times_two = 2*d_very_early_late_spc_chips;
+    // Running code phase; starts at the phase of the first output sample
+    float phase = tcode_half_chips_input;
+    unsigned int n = 0;
+
+    while (n < num_points)
+    {
+        // Shift the phase back, wrap it with fmod, round it and offset the result by 2
+        const int chip = 2 + round(fmod(phase - spacing_times_two, code_length_half_chips));
+        // Copy the selected table entry to the next output position
+        d_very_early_code[n++] = d_ca_code[chip];
+        // Move on to the phase of the following sample
+        phase = phase + code_phase_step_half_chips;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,178 @@
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Multiplies every element of a complex vector by a complex scalar (SSE3, unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    // With a = ar + j*ai and scalar = sr + j*si: a*scalar = (ar*sr - ai*si) + j*(ai*sr + ar*si)
+    const unsigned int pair_count = num_points / 2;
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int pair;
+
+    // Broadcast the real and the imaginary part of the scalar over all four lanes
+    const __m128 scalar_re = _mm_set_ps1(lv_creal(scalar));
+    const __m128 scalar_im = _mm_set_ps1(lv_cimag(scalar));
+
+    // Two complex values (four floats) are handled per iteration
+    for(pair = 0; pair < pair_count; pair++){
+
+        const __m128 in = _mm_loadu_ps((float*)src); // ar,ai,br,bi
+
+        const __m128 in_swapped = _mm_shuffle_ps(in,in,0xB1); // ai,ar,bi,br
+
+        const __m128 prod_re = _mm_mul_ps(in,scalar_re); // ar*sr,ai*sr,br*sr,bi*sr
+
+        const __m128 prod_im = _mm_mul_ps(in_swapped,scalar_im); // ai*si,ar*si,bi*si,br*si
+
+        // Even lanes subtract, odd lanes add: ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+        _mm_storeu_ps((float*)dst, _mm_addsub_ps(prod_re,prod_im));
+
+        src += 2;
+        dst += 2;
+    }
+
+    // An odd num_points leaves one value for the scalar path
+    if(num_points & 1) {
+        *dst = (*src) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies every element of a complex vector by a complex scalar
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+*/
+static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    // Local cursors walk both buffers; values are multiplied strictly in index order
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int remaining = num_points;
+    unsigned int k;
+
+    // Main pass: consume the input in chunks of eight values
+    for (; remaining >= 8; remaining -= 8){
+        for (k = 0; k < 8; k++){
+            dst[k] = src[k] * scalar;
+        }
+        src += 8;
+        dst += 8;
+    }
+
+    // Tail pass: fewer than eight values are left over
+    for (k = 0; k < remaining; k++){
+        dst[k] = src[k] * scalar;
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
+#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Multiplies every element of a complex vector by a complex scalar (SSE3, aligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    // With a = ar + j*ai and scalar = sr + j*si: a*scalar = (ar*sr - ai*si) + j*(ai*sr + ar*si)
+    const unsigned int pair_count = num_points / 2;
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int pair;
+
+    // Broadcast the real and the imaginary part of the scalar over all four lanes
+    const __m128 scalar_re = _mm_set_ps1(lv_creal(scalar));
+    const __m128 scalar_im = _mm_set_ps1(lv_cimag(scalar));
+
+    // Two complex values (four floats) are handled per iteration
+    for(pair = 0; pair < pair_count; pair++){
+
+        const __m128 in = _mm_load_ps((float*)src); // ar,ai,br,bi
+
+        const __m128 in_swapped = _mm_shuffle_ps(in,in,0xB1); // ai,ar,bi,br
+
+        const __m128 prod_re = _mm_mul_ps(in,scalar_re); // ar*sr,ai*sr,br*sr,bi*sr
+
+        const __m128 prod_im = _mm_mul_ps(in_swapped,scalar_im); // ai*si,ar*si,bi*si,br*si
+
+        // Even lanes subtract, odd lanes add: ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+        _mm_store_ps((float*)dst, _mm_addsub_ps(prod_re,prod_im));
+
+        src += 2;
+        dst += 2;
+    }
+
+    // An odd num_points leaves one value for the scalar path
+    if(num_points & 1) {
+        *dst = (*src) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies every element of a complex vector by a complex scalar
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    // Local cursors walk both buffers; values are multiplied strictly in index order
+    const lv_32fc_t* src = aVector;
+    lv_32fc_t* dst = cVector;
+    unsigned int remaining = num_points;
+    unsigned int k;
+
+    // Main pass: consume the input in chunks of eight values
+    for (; remaining >= 8; remaining -= 8){
+        for (k = 0; k < 8; k++){
+            dst[k] = src[k] * scalar;
+        }
+        src += 8;
+        dst += 8;
+    }
+
+    // Tail pass: fewer than eight values are left over
+    for (k = 0; k < remaining; k++){
+        dst[k] = src[k] * scalar;
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,763 @@
+#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+    // Complex dot product: *result = sum over i of input[i]*taps[i].
+    // Even- and odd-indexed values are accumulated separately and combined after the paired loop.
+    const float* x = (const float*) input;
+    const float* t = (const float*) taps;
+    float* out = (float*) result;
+    const unsigned int pair_count = num_points >> 1;
+    float even_re = 0, even_im = 0;
+    float odd_re = 0, odd_im = 0;
+    unsigned int k;
+
+    for(k = 0; k < pair_count; ++k) {
+        // (a + jb)(c + jd) = (ac - bd) + j(ad + bc), evaluated on interleaved re/im floats
+        even_re += x[0] * t[0] - x[1] * t[1];
+        even_im += x[0] * t[1] + x[1] * t[0];
+        odd_re += x[2] * t[2] - x[3] * t[3];
+        odd_im += x[2] * t[3] + x[3] * t[2];
+
+        x += 4;
+        t += 4;
+    }
+
+    out[0] = even_re + odd_re;
+    out[1] = even_im + odd_im;
+
+    // An odd num_points leaves the last value unpaired; add it separately
+    if(num_points & 1) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Complex dot product *result = sum of input[i]*taps[i] over num_points complex values, computed with hand-written SSE assembly using unaligned loads (movups).
+ const unsigned long long num_bytes = (unsigned long long)num_points*8; // 8 bytes per complex value; 64-bit so the product cannot wrap and all of rcx is defined
+ unsigned int isodd = num_points & 1;
+ // The assembly handles complete pairs of complex values only. NOTE(review): it loads 16 bytes from input and taps before testing the length and preloads the following 16 bytes in every loop iteration, so reads can extend past the last complete pair.
+ asm
+ (
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movups 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movups (%%r9), %%xmmA\n\t"
+ "# movups (%%r10), %%xmmB\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movups %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movups 16(%%r9), %%xmm1\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movups 16(%%r10), %%xmm3\n\t"
+ " movups %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movups 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movups 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movups %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movups %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc" // every register the code writes, plus memory (store through result) and the flags
+ );
+ // A trailing unpaired value (odd num_points) is accumulated in C on top of the stored sum
+
+ if(isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ return;
+
+}
+
+#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+
+
+
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+    // Complex dot product: *result = sum over i of input[i]*taps[i] (SSE3, unaligned loads).
+    // Two complex products are accumulated per iteration; the two partial sums
+    // held in the vector accumulator are added together after the loop.
+    const unsigned int pair_count = num_points/2;
+    const lv_32fc_t* in_ptr = input;
+    const lv_32fc_t* taps_ptr = taps;
+    unsigned int pair;
+
+    // Scalar accumulator, cleared byte-wise
+    lv_32fc_t total;
+    memset(&total, 0x0, 2*sizeof(float));
+
+    __m128 acc = _mm_setzero_ps();
+
+    for(pair = 0; pair < pair_count; pair++){
+
+        const __m128 in = _mm_loadu_ps((float*)in_ptr); // ar,ai,br,bi
+        const __m128 tp = _mm_loadu_ps((float*)taps_ptr); // cr,ci,dr,di
+
+        const __m128 tp_re = _mm_moveldup_ps(tp); // cr,cr,dr,dr
+        const __m128 tp_im = _mm_movehdup_ps(tp); // ci,ci,di,di
+
+        const __m128 in_swapped = _mm_shuffle_ps(in,in,0xB1); // ai,ar,bi,br
+
+        const __m128 part_re = _mm_mul_ps(in,tp_re); // ar*cr,ai*cr,br*dr,bi*dr
+
+        const __m128 part_im = _mm_mul_ps(in_swapped,tp_im); // ai*ci,ar*ci,bi*di,br*di
+
+        // addsub yields ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+        acc = _mm_add_ps(acc, _mm_addsub_ps(part_re,part_im));
+
+        in_ptr += 2;
+        taps_ptr += 2;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial[2];
+
+    _mm_storeu_ps((float*)partial,acc); // spill the two partial complex sums
+
+    total += ( partial[0] + partial[1] );
+
+    // An odd num_points leaves one trailing value
+    if(num_points & 1) {
+        total += input[num_points - 1] * taps[num_points - 1];
+    }
+
+    *result = total;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+    // Complex dot product: *result = sum over i of input[i]*taps[i] (SSE4.1, unaligned loads).
+    // Each iteration takes four complex values from input and from taps, separates
+    // them into real-only and imaginary-only vectors and accumulates the four
+    // partial sums re*re, im*im, re*im and im*re with _mm_dp_ps.
+    // Values beyond the last complete block of four are added by a scalar loop.
+    const unsigned int block_count = num_points/4;
+    const unsigned int leftover = num_points & 3;
+    const float* in_ptr = (const float*)input;
+    const float* taps_ptr = (const float*)taps;
+    unsigned int n;
+
+    // Mask with only the sign bit of the lowest float lane set
+    static const __m128i neg = {0x000000000000000080000000};
+
+    __m128 sum_rr = _mm_setzero_ps(); // real(input)*real(taps), kept in lane 0
+    __m128 sum_ii = _mm_setzero_ps(); // imag(input)*imag(taps), kept in lane 0
+    __m128 sum_ri = _mm_setzero_ps(); // real(input)*imag(taps), kept in lane 1
+    __m128 sum_ir = _mm_setzero_ps(); // imag(input)*real(taps), kept in lane 1
+
+    for(n = 0; n < block_count; ++n) {
+        const __m128 in_lo = _mm_loadu_ps(in_ptr);
+        const __m128 taps_lo = _mm_loadu_ps(taps_ptr);
+
+        const __m128 in_hi = _mm_loadu_ps(in_ptr + 4);
+        const __m128 taps_hi = _mm_loadu_ps(taps_ptr + 4);
+
+        // First interleave step
+        const __m128 in_mix_hi = _mm_unpackhi_ps(in_lo, in_hi);
+        const __m128 taps_mix_hi = _mm_unpackhi_ps(taps_lo, taps_hi);
+        const __m128 in_mix_lo = _mm_unpacklo_ps(in_lo, in_hi);
+        const __m128 taps_mix_lo = _mm_unpacklo_ps(taps_lo, taps_hi);
+
+        //imaginary vector from input
+        const __m128 in_imag = _mm_unpackhi_ps(in_mix_lo, in_mix_hi);
+        //real vector from input
+        const __m128 in_real = _mm_unpacklo_ps(in_mix_lo, in_mix_hi);
+        //imaginary vector from taps
+        const __m128 taps_imag = _mm_unpackhi_ps(taps_mix_lo, taps_mix_hi);
+        //real vector from taps
+        const __m128 taps_real = _mm_unpacklo_ps(taps_mix_lo, taps_mix_hi);
+
+        // Mask 0xf1 leaves the four-lane dot product in lane 0
+        sum_rr = _mm_add_ps(_mm_dp_ps(in_real, taps_real, 0xf1), sum_rr);
+        sum_ii = _mm_add_ps(_mm_dp_ps(in_imag, taps_imag, 0xf1), sum_ii);
+
+        // Mask 0xf2 leaves it in lane 1
+        sum_ri = _mm_add_ps(_mm_dp_ps(in_real, taps_imag, 0xf2), sum_ri);
+        sum_ir = _mm_add_ps(_mm_dp_ps(in_imag, taps_real, 0xf2), sum_ir);
+
+        in_ptr += 8;
+        taps_ptr += 8;
+    }
+
+    // The real part is sum_rr - sum_ii: flip the sign of lane 0 and add below
+    sum_ii = _mm_xor_ps(sum_ii, bit128_p(&neg)->float_vec);
+
+    sum_ri = _mm_add_ps(sum_ri, sum_ir);
+    sum_rr = _mm_add_ps(sum_rr, sum_ii);
+
+    // Lane 0 now carries the real part and lane 1 the imaginary part
+    sum_ri = _mm_add_ps(sum_ri, sum_rr);
+
+    // Store the two low floats as the complex result
+    _mm_storel_pi((__m64*)result, sum_ri);
+
+    // Scalar loop over the values that did not fill a block of four
+    for(n = num_points-leftover; n < num_points; n++) {
+        *result += input[n] * taps[n];
+    }
+
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+
+
+#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
+#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
+#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+    // Complex dot product: *result = sum over i of input[i]*taps[i].
+    // result: where the sum is stored; input, taps: the two complex vectors; num_points: number of complex values in each vector.
+    float * res = (float*) result;
+    float * in = (float*) input;
+    float * tp = (float*) taps;
+    // Number of complete pairs, taken from num_points directly: a num_points*8 byte count would wrap a 32-bit unsigned int for num_points >= 2^29
+    unsigned int n_2_ccomplex_blocks = num_points/2;
+    unsigned int isodd = num_points & 1;
+
+    float sum0[2] = {0,0};
+    float sum1[2] = {0,0};
+    unsigned int i = 0;
+
+    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+        sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+        in += 4;
+        tp += 4;
+    }
+
+    res[0] = sum0[0] + sum1[0];
+    res[1] = sum0[1] + sum1[1];
+    // Cleanup if we had an odd number of points
+    for(i = 0; i < isodd; ++i) {
+        *result += input[num_points - 1] * taps[num_points - 1];
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Complex dot product *result = sum of input[i]*taps[i] over num_points complex values, computed with hand-written SSE assembly using aligned loads (movaps) from input and taps.
+ const unsigned long long num_bytes = (unsigned long long)num_points*8; // 8 bytes per complex value; 64-bit so the product cannot wrap and all of rcx is defined
+ unsigned int isodd = num_points & 1;
+ // The assembly handles complete pairs of complex values only. NOTE(review): it loads 16 bytes from input and taps before testing the length and preloads the following 16 bytes in every loop iteration, so reads can extend past the last complete pair.
+ asm
+ (
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%r9), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " movaps 32(%%r10), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+ :
+ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc" // every register the code writes, plus memory (store through result) and the flags
+ );
+ // A trailing unpaired value (odd num_points) is accumulated in C on top of the stored sum
+
+ if(isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ return;
+
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // The result is produced entirely by the generic kernel called on the next line.
+ volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+ // Everything inside the #if 0 below is compiled out: 32-bit inline assembly that reads its arguments from fixed %ebp offsets and declares no asm operands.
+#if 0
+ const unsigned int num_bytes = num_points*8;
+ unsigned int isodd = num_points & 1;
+
+ asm volatile
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " movl 12(%%ebp), %%eax # input\n\t"
+ " movl 16(%%ebp), %%edx # taps\n\t"
+ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%eax), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%edx), %%xmm2\n\t"
+ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%eax), %%xmmA\n\t"
+ "# movaps (%%edx), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%eax), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%edx), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%eax), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %%eax\n\t"
+ " movaps 32(%%edx), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " addl $32, %%edx\n\t"
+ ".%=L1_test:\n\t"
+ " decl %%ecx\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %%ecx\n\t"
+ " andl $1, %%ecx\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " movl 8(%%ebp), %%eax \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%%eax)\n\t"
+ " movss (%%eax), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %%eax # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ :
+ : "eax", "ecx", "edx"
+ );
+
+ // NOTE(review): getem below is computed but never read anywhere in this function.
+ int getem = num_bytes % 16;
+
+ if(isodd) {
+ *result += (input[num_points - 1] * taps[num_points - 1]);
+ }
+
+ return;
+#endif
+}
+
+#endif /* LV_HAVE_SSE && LV_HAVE_32 */
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+    // Aligned SSE3 dot product: *result = sum over k of input[k] * taps[k].
+    // input and taps are read with _mm_load_ps, so both must be 16-byte aligned.
+    const unsigned int isodd = num_points & 1;
+
+    lv_32fc_t dotProduct;
+    memset(&dotProduct, 0x0, 2*sizeof(float));
+    unsigned int number = 0;
+    // Pair count taken straight from num_points: going through a byte count (num_points*8) wraps a 32-bit unsigned int once num_points >= 2^29.
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+    const lv_32fc_t* a = input;
+    const lv_32fc_t* b = taps;
+
+    dotProdVal = _mm_setzero_ps();
+
+    for(;number < halfPoints; number++){
+
+        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+        dotProdVal = _mm_add_ps(dotProdVal, z); // Accumulate the two complex products of this iteration
+
+        a += 2;
+        b += 2;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+    _mm_store_ps((float*)dotProductVector,dotProdVal); // Spill the two partial complex sums to aligned storage
+
+    dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+    if(isodd) {
+        dotProduct += input[num_points - 1] * taps[num_points - 1]; // Unpaired last point, plain complex arithmetic
+    }
+
+    *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+ // Aligned SSE4.1 dot product of input and taps; the main loop consumes four complex points per pass.
+ unsigned int i = 0;
+ const unsigned int qtr_points = num_points/4;
+ const unsigned int isodd = num_points & 3;
+ // Despite its name, isodd is the number of leftover points (num_points % 4) handled by the scalar loop at the end.
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+ float *p_input, *p_taps;
+ __m64 *p_result;
+ // Sign-bit mask for the lowest 32-bit lane; presumably reinterpreted as floats by bit128_p (defined outside this view).
+ static const __m128i neg = {0x000000000000000080000000};
+
+ p_result = (__m64*)result;
+ p_input = (float*)input;
+ p_taps = (float*)taps;
+
+ real0 = _mm_setzero_ps();
+ real1 = _mm_setzero_ps();
+ im0 = _mm_setzero_ps();
+ im1 = _mm_setzero_ps();
+ // Each pass loads 8 floats (4 complex points) from input and 8 from taps, using aligned loads.
+ for(; i < qtr_points; ++i) {
+ xmm0 = _mm_load_ps(p_input);
+ xmm1 = _mm_load_ps(p_taps);
+
+ p_input += 4;
+ p_taps += 4;
+
+ xmm2 = _mm_load_ps(p_input);
+ xmm3 = _mm_load_ps(p_taps);
+
+ p_input += 4;
+ p_taps += 4;
+ // De-interleave re/im: two rounds of unpack leave all real parts in one register and all imaginary parts in another.
+ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+ //imaginary vector from input
+ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+ //real vector from input
+ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+ //imaginary vector from taps
+ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+ //real vector from taps
+ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+ // Mask 0xf1 sums the four products into lane 0 (real terms); mask 0xf2 sums them into lane 1 (imaginary terms).
+ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+ real0 = _mm_add_ps(xmm4, real0);
+ real1 = _mm_add_ps(xmm5, real1);
+ im0 = _mm_add_ps(xmm6, im0);
+ im1 = _mm_add_ps(xmm7, im1);
+ }
+ // real1 accumulated im(input)*im(taps); flip its sign so that real0 + real1 = sum(re*re) - sum(im*im).
+ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+ im0 = _mm_add_ps(im0, im1);
+ real0 = _mm_add_ps(real0, real1);
+
+ im0 = _mm_add_ps(im0, real0);
+
+ _mm_storel_pi(p_result, im0);
+ // Scalar tail: add the remaining isodd points onto the value just stored in *result.
+ for(i = num_points-isodd; i < num_points; i++) {
+ *result += input[i] * taps[i];
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,170 @@
+#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2; // two complex points fit in one __m128
+    const unsigned int lastIdx = num_points - 1;   // only used when num_points is odd
+    unsigned int pair;
+
+    for(pair = 0; pair < pairCount; pair++){
+        const unsigned int offset = 2 * pair;
+        __m128 aVal, bVal, bReal, bImag, aSwapped, prodReal, prodImag, cVal;
+
+        // Unaligned loads of two interleaved complex points: re0,im0,re1,im1
+        aVal = _mm_loadu_ps((float*)(aVector + offset));
+        bVal = _mm_loadu_ps((float*)(bVector + offset));
+
+        // Duplicate the real and the imaginary parts of b within each point
+        bReal = _mm_moveldup_ps(bVal); // br0,br0,br1,br1
+        bImag = _mm_movehdup_ps(bVal); // bi0,bi0,bi1,bi1
+
+        // Swapped copy of a, so that the cross terms line up with bImag
+        aSwapped = _mm_shuffle_ps(aVal,aVal,0xB1); // ai0,ar0,ai1,ar1
+
+        prodReal = _mm_mul_ps(aVal,bReal);     // ar*br, ai*br, ...
+        prodImag = _mm_mul_ps(aSwapped,bImag); // ai*bi, ar*bi, ...
+
+        // Subtract in even lanes, add in odd lanes: ar*br-ai*bi, ai*br+ar*bi, ...
+        cVal = _mm_addsub_ps(prodReal,prodImag);
+
+        // Unaligned store of the two products
+        _mm_storeu_ps((float*)(cVector + offset),cVal);
+
+    }
+
+    // A trailing unpaired point is multiplied with plain complex arithmetic
+    if((num_points & 1) != 0) {
+        cVector[lastIdx] = aVector[lastIdx] * bVector[lastIdx];
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Point-wise complex product: cVector[idx] = aVector[idx] * bVector[idx]
+    for(idx = 0; idx < num_points; idx++){
+        const lv_32fc_t lhs = aVector[idx];
+        const lv_32fc_t rhs = bVector[idx];
+        cVector[idx] = lhs * rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
+#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2; // two complex points fit in one __m128
+    const unsigned int lastIdx = num_points - 1;   // only used when num_points is odd
+    unsigned int pair;
+
+    for(pair = 0; pair < pairCount; pair++){
+        const unsigned int offset = 2 * pair;
+        __m128 aVal, bVal, bReal, bImag, aSwapped, prodReal, prodImag, cVal;
+
+        // Aligned loads of two interleaved complex points: re0,im0,re1,im1
+        aVal = _mm_load_ps((float*)(aVector + offset));
+        bVal = _mm_load_ps((float*)(bVector + offset));
+
+        // Duplicate the real and the imaginary parts of b within each point
+        bReal = _mm_moveldup_ps(bVal); // br0,br0,br1,br1
+        bImag = _mm_movehdup_ps(bVal); // bi0,bi0,bi1,bi1
+
+        // Swapped copy of a, so that the cross terms line up with bImag
+        aSwapped = _mm_shuffle_ps(aVal,aVal,0xB1); // ai0,ar0,ai1,ar1
+
+        prodReal = _mm_mul_ps(aVal,bReal);     // ar*br, ai*br, ...
+        prodImag = _mm_mul_ps(aSwapped,bImag); // ai*bi, ar*bi, ...
+
+        // Subtract in even lanes, add in odd lanes: ar*br-ai*bi, ai*br+ar*bi, ...
+        cVal = _mm_addsub_ps(prodReal,prodImag);
+
+        // Aligned store of the two products
+        _mm_store_ps((float*)(cVector + offset),cVal);
+    }
+
+    // A trailing unpaired point is multiplied with plain complex arithmetic
+    if((num_points & 1) != 0) {
+        cVector[lastIdx] = aVector[lastIdx] * bVector[lastIdx];
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Point-wise complex product: cVector[idx] = aVector[idx] * bVector[idx]
+    for(idx = 0; idx < num_points; idx++){
+        const lv_32fc_t lhs = aVector[idx];
+        const lv_32fc_t rhs = bVector[idx];
+        cVector[idx] = lhs * rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector by forwarding all arguments to the externally defined _a_orc_impl routine
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,409 @@
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+/*!
+ * TODO: Code the SSE4 version and benchmark it
+ */
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+
+ /*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3, no alignment required on any input)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    lv_32fc_t dotProduct_E;
+    memset(&dotProduct_E, 0x0, 2*sizeof(float));
+    lv_32fc_t dotProduct_P;
+    memset(&dotProduct_P, 0x0, 2*sizeof(float));
+    lv_32fc_t dotProduct_L;
+    memset(&dotProduct_L, 0x0, 2*sizeof(float));
+
+    // Aux vars
+    __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
+
+    z_E = _mm_setzero_ps();
+    z_P = _mm_setzero_ps();
+    z_L = _mm_setzero_ps();
+
+    // Read cursors. This is the unaligned kernel, so every vector load below uses _mm_loadu_ps.
+    // No baseband buffer is written: the wiped-off samples stay in a register.
+    const lv_32fc_t* _input = input;
+    const lv_32fc_t* _carrier = carrier;
+    const lv_32fc_t* _E_code = E_code;
+    const lv_32fc_t* _P_code = P_code;
+    const lv_32fc_t* _L_code = L_code;
+
+    for(;number < halfPoints; number++)
+    {
+        // carrier wipe-off (vector point-to-point product)
+        x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+        y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+        // z now holds two baseband samples; they are reused for the E, P and L products
+
+        // correlation E,P,L (3x vector scalar product)
+        // Early
+        // Start from the baseband samples in natural (re,im) order
+        x = z;
+
+        y = _mm_loadu_ps((float*)_E_code); // Unaligned load of the code replica as cr,ci,dr,di
+
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+        z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
+
+        // Prompt
+        // x is still swapped from the Early stage; the first shuffle below restores it
+        y = _mm_loadu_ps((float*)_P_code); // Unaligned load of the code replica as cr,ci,dr,di
+
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is ar,ai,br,bi again
+
+        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+        z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
+
+        // Late
+        // x is still swapped from the Prompt stage; the first shuffle below restores it
+        y = _mm_loadu_ps((float*)_L_code); // Unaligned load of the code replica as cr,ci,dr,di
+
+        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is ar,ai,br,bi again
+
+        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+        z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
+
+        /*pointer increment*/
+        _carrier += 2;
+        _input += 2;
+        // (two complex points were consumed from every input)
+        _E_code += 2;
+        _P_code += 2;
+        _L_code +=2;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+    // The scratch arrays above are 16-byte aligned, so aligned stores are safe here
+
+    _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+
+    dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
+    dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
+    dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
+
+    if((num_points % 2) != 0)
+    {
+        // Unpaired last point: plain complex arithmetic on the current cursors
+        dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+        dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+        dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+    }
+
+    *E_out = dotProduct_E;
+    *P_out = dotProduct_P;
+    *L_out = dotProduct_L;
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+{
+    lv_32fc_t bb_signal_sample;
+    unsigned int i; // same type as num_points: no signed/unsigned comparison and no signed overflow on long vectors
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    // perform Early, Prompt and Late correlation
+    for(i = 0; i < num_points; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = input[i] * carrier[i];
+        // Accumulate the baseband sample against each code replica, directly into the outputs
+        *E_out += bb_signal_sample * E_code[i];
+        *P_out += bb_signal_sample * P_code[i];
+        *L_out += bb_signal_sample * L_code[i];
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3; all five inputs are read with aligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ lv_32fc_t dotProduct_E;
+ memset(&dotProduct_E, 0x0, 2*sizeof(float));
+ lv_32fc_t dotProduct_P;
+ memset(&dotProduct_P, 0x0, 2*sizeof(float));
+ lv_32fc_t dotProduct_L;
+ memset(&dotProduct_L, 0x0, 2*sizeof(float));
+
+ // Aux vars
+ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
+
+ z_E = _mm_setzero_ps();
+ z_P = _mm_setzero_ps();
+ z_L = _mm_setzero_ps();
+
+ // Read cursors over the five inputs
+ // (no baseband buffer is written: the wiped-off samples stay in a register)
+ const lv_32fc_t* _input = input;
+ const lv_32fc_t* _carrier = carrier;
+ const lv_32fc_t* _E_code = E_code;
+ const lv_32fc_t* _P_code = P_code;
+ const lv_32fc_t* _L_code = L_code;
+
+ for(;number < halfPoints; number++)
+ {
+ // carrier wipe-off (vector point-to-point product)
+ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ // z now holds two baseband samples; they are reused for the E, P and L products
+
+ // correlation E,P,L (3x vector scalar product)
+ // Early
+ // Start from the baseband samples in natural (re,im) order
+ x = z;
+
+ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
+
+ // Prompt
+ // x is still swapped from the Early stage; the first shuffle below restores it
+ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is ar,ai,br,bi again
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
+
+ // Late
+ // x is still swapped from the Prompt stage; the first shuffle below restores it
+ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is ar,ai,br,bi again
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
+
+ /*pointer increment*/
+ _carrier += 2;
+ _input += 2;
+ // (two complex points were consumed from every input)
+ _E_code += 2;
+ _P_code += 2;
+ _L_code +=2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+ // Each scratch array above receives the two partial complex sums of one accumulator
+
+ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+
+ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
+ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
+ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
+
+ if((num_points % 2) != 0)
+ {
+ // Unpaired last point: plain complex arithmetic on the current cursors
+ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+ }
+
+ *E_out = dotProduct_E;
+ *P_out = dotProduct_P;
+ *L_out = dotProduct_L;
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+{
+    lv_32fc_t bb_signal_sample;
+    unsigned int i; // same type as num_points: no signed/unsigned comparison and no signed overflow on long vectors
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    // perform Early, Prompt and Late correlation
+    for(i = 0; i < num_points; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = input[i] * carrier[i];
+        // Accumulate the baseband sample against each code replica, directly into the outputs
+        *E_out += bb_signal_sample * E_code[i];
+        *P_out += bb_signal_sample * P_code[i];
+        *L_out += bb_signal_sample * L_code[i];
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,848 @@
+/*!
+ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors
+ * \authors <ul>
+ * <li>Javier Arribas, 2011. jarribas(at)cttc.es
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the
+ * real part and 32 bits the imaginary part):
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 64 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - VE values are calculated by multiplying the input signal in BB by the
+ * VE code (multiplication of 64 bits vectors), accumulating the results
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 64 bits vectors), accumulating the results
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 64 bits vectors), accumulating the results
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 64 bits vectors), accumulating the results
+ * - VL values are calculated by multiplying the input signal in BB by the
+ * VL code (multiplication of 64 bits vectors), accumulating the results
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 4; // despite the name: number of full 4-sample (256-bit) vector iterations
+
+ lv_32fc_t dotProduct_VE;
+ lv_32fc_t dotProduct_E;
+ lv_32fc_t dotProduct_P;
+ lv_32fc_t dotProduct_L;
+ lv_32fc_t dotProduct_VL;
+
+ // Aux vars
+ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+ __m256 bb_signal_sample, bb_signal_sample_shuffled;
+
+ z_VE = _mm256_setzero_ps();
+ z_E = _mm256_setzero_ps();
+ z_P = _mm256_setzero_ps();
+ z_L = _mm256_setzero_ps();
+ z_VL = _mm256_setzero_ps();
+
+ //input and output vectors
+ const lv_32fc_t* _input = input;
+ const lv_32fc_t* _carrier = carrier;
+ const lv_32fc_t* _VE_code = VE_code;
+ const lv_32fc_t* _E_code = E_code;
+ const lv_32fc_t* _P_code = P_code;
+ const lv_32fc_t* _L_code = L_code;
+ const lv_32fc_t* _VL_code = VL_code;
+
+ for(;number < halfPoints; number++)
+ {
+ // carrier wipe-off (vector point-to-point product); lane comments below show the first two of the four complex values
+ x = _mm256_loadu_ps((float*)_input); // Load 4 complex input samples (8 floats, re/im interleaved): ar,ai,br,bi,...
+ y = _mm256_loadu_ps((float*)_carrier); // Load 4 complex carrier samples (8 floats, re/im interleaved): cr,ci,dr,di,...
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+
+ // correlation VE,E,P,L,VL (5x vector scalar product)
+ // VE
+ y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
+
+ // Early
+ y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
+
+ // Prompt
+ y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
+
+ // Late
+ y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
+
+ // VL
+ y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
+
+ /*pointer increment*/
+ _carrier += 4;
+ _input += 4;
+ _VE_code += 4;
+ _E_code += 4;
+ _P_code += 4;
+ _L_code += 4;
+ _VL_code += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
+
+ _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+ _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+
+ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
+ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
+ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
+ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
+ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
+
+ for (unsigned int i = 0; i<(num_points % 4); ++i) // scalar tail: the 0-3 samples left after the vector loop
+ {
+ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
+ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
+ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
+ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
+ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); // last use in this iteration: advance input and carrier here
+ }
+
+ *VE_out = dotProduct_VE;
+ *E_out = dotProduct_E;
+ *P_out = dotProduct_P;
+ *L_out = dotProduct_L;
+ *VL_out = dotProduct_VL;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+ /*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (SSE3, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2; // each 128-bit iteration consumes 2 complex samples
+
+ lv_32fc_t dotProduct_VE;
+ lv_32fc_t dotProduct_E;
+ lv_32fc_t dotProduct_P;
+ lv_32fc_t dotProduct_L;
+ lv_32fc_t dotProduct_VL;
+
+ // Aux vars
+ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+ __m128 bb_signal_sample, bb_signal_sample_shuffled;
+
+ z_VE = _mm_setzero_ps();
+ z_E = _mm_setzero_ps();
+ z_P = _mm_setzero_ps();
+ z_L = _mm_setzero_ps();
+ z_VL = _mm_setzero_ps();
+
+ //input and output vectors
+ const lv_32fc_t* _input = input;
+ const lv_32fc_t* _carrier = carrier;
+ const lv_32fc_t* _VE_code = VE_code;
+ const lv_32fc_t* _E_code = E_code;
+ const lv_32fc_t* _P_code = P_code;
+ const lv_32fc_t* _L_code = L_code;
+ const lv_32fc_t* _VL_code = VL_code;
+
+ for(;number < halfPoints; number++)
+ {
+ // carrier wipe-off (vector point-to-point product)
+ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+
+ // correlation VE,E,P,L,VL (5x vector scalar product)
+ // VE
+ y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
+
+ // Early
+ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
+
+ // Prompt
+ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
+
+ // Late
+ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
+
+ // VL
+ // bb_signal_sample computed above is reused here, so the input is not loaded again
+ y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
+
+ /*pointer increment*/
+ _carrier += 2;
+ _input += 2;
+ _VE_code += 2;
+ _E_code += 2;
+ _P_code += 2;
+ _L_code +=2;
+ _VL_code +=2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
+
+ _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+
+ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
+ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
+ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
+
+ if((num_points % 2) != 0) // odd num_points: one sample is left over after the vector loop
+ {
+ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
+ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
+ }
+
+ *VE_out = dotProduct_VE;
+ *E_out = dotProduct_E;
+ *P_out = dotProduct_P;
+ *L_out = dotProduct_L;
+ *VL_out = dotProduct_VL;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+ lv_32fc_t bb_signal_sample;
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *VE_out = 0;
+ *E_out = 0;
+ *P_out = 0;
+ *L_out = 0;
+ *VL_out = 0;
+ // perform VE, Early, Prompt, Late and VL correlation (index is unsigned to match num_points)
+ for(unsigned int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+ // Accumulate the baseband sample against each of the five code replicas
+ *VE_out += bb_signal_sample * VE_code[i];
+ *E_out += bb_signal_sample * E_code[i];
+ *P_out += bb_signal_sample * P_code[i];
+ *L_out += bb_signal_sample * L_code[i];
+ *VL_out += bb_signal_sample * VL_code[i];
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, aligned loads: input, carrier and code pointers must be 32-byte aligned)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 4; // despite the name: number of full 4-sample (256-bit) vector iterations
+
+ lv_32fc_t dotProduct_VE;
+ lv_32fc_t dotProduct_E;
+ lv_32fc_t dotProduct_P;
+ lv_32fc_t dotProduct_L;
+ lv_32fc_t dotProduct_VL;
+
+ // Aux vars
+ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+ __m256 bb_signal_sample, bb_signal_sample_shuffled;
+
+ z_VE = _mm256_setzero_ps();
+ z_E = _mm256_setzero_ps();
+ z_P = _mm256_setzero_ps();
+ z_L = _mm256_setzero_ps();
+ z_VL = _mm256_setzero_ps();
+
+ //input and output vectors
+ const lv_32fc_t* _input = input;
+ const lv_32fc_t* _carrier = carrier;
+ const lv_32fc_t* _VE_code = VE_code;
+ const lv_32fc_t* _E_code = E_code;
+ const lv_32fc_t* _P_code = P_code;
+ const lv_32fc_t* _L_code = L_code;
+ const lv_32fc_t* _VL_code = VL_code;
+
+ for(;number < halfPoints; number++)
+ {
+ // carrier wipe-off (vector point-to-point product); lane comments below show the first two of the four complex values
+ x = _mm256_load_ps((float*)_input); // Load 4 complex input samples (8 floats, re/im interleaved): ar,ai,br,bi,...
+ y = _mm256_load_ps((float*)_carrier); // Load 4 complex carrier samples (8 floats, re/im interleaved): cr,ci,dr,di,...
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+
+ // correlation VE,E,P,L,VL (5x vector scalar product)
+ // VE
+ y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
+
+ // Early
+ y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
+
+ // Prompt
+ y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
+
+ // Late
+ y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
+
+ // VL
+ y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
+
+ /*pointer increment*/
+ _carrier += 4;
+ _input += 4;
+ _VE_code += 4;
+ _E_code += 4;
+ _P_code += 4;
+ _L_code += 4;
+ _VL_code += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
+
+ _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+ _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+
+ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
+ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
+ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
+ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
+ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
+
+ for (unsigned int i = 0; i<(num_points % 4); ++i) // scalar tail: the 0-3 samples left after the vector loop
+ {
+ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
+ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
+ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
+ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
+ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); // last use in this iteration: advance input and carrier here
+ }
+
+ *VE_out = dotProduct_VE;
+ *E_out = dotProduct_E;
+ *P_out = dotProduct_P;
+ *L_out = dotProduct_L;
+ *VL_out = dotProduct_VL;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Early PRN code replica input
+ \param L_code Early PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Early correlation output
+ \param L_out Early correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ lv_32fc_t dotProduct_VE;
+ lv_32fc_t dotProduct_E;
+ lv_32fc_t dotProduct_P;
+ lv_32fc_t dotProduct_L;
+ lv_32fc_t dotProduct_VL;
+
+ // Aux vars
+ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+ __m128 bb_signal_sample, bb_signal_sample_shuffled;
+
+ z_VE = _mm_setzero_ps();
+ z_E = _mm_setzero_ps();
+ z_P = _mm_setzero_ps();
+ z_L = _mm_setzero_ps();
+ z_VL = _mm_setzero_ps();
+
+ //input and output vectors
+ const lv_32fc_t* _input = input;
+ const lv_32fc_t* _carrier = carrier;
+ const lv_32fc_t* _VE_code = VE_code;
+ const lv_32fc_t* _E_code = E_code;
+ const lv_32fc_t* _P_code = P_code;
+ const lv_32fc_t* _L_code = L_code;
+ const lv_32fc_t* _VL_code = VL_code;
+
+ for(;number < halfPoints; number++)
+ {
+ // carrier wipe-off (vector point-to-point product)
+ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+
+ // correlation VE,E,P,L,VL (5x vector scalar product)
+ // VE
+ y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
+
+ // Early
+ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
+
+ // Prompt
+ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
+
+ // Late
+ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
+
+ // VL
+ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
+
+ /*pointer increment*/
+ _carrier += 2;
+ _input += 2;
+ _VE_code += 2;
+ _E_code += 2;
+ _P_code += 2;
+ _L_code +=2;
+ _VL_code +=2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
+
+ _mm_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+ _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+
+ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
+ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
+ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
+
+ if((num_points % 2) != 0)
+ {
+ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
+ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
+ }
+
+ *VE_out = dotProduct_VE;
+ *E_out = dotProduct_E;
+ *P_out = dotProduct_P;
+ *L_out = dotProduct_L;
+ *VL_out = dotProduct_VL;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+    lv_32fc_t bb_signal_sample;
+
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+    // perform VE, Early, Prompt, Late and VL correlation
+    for(unsigned int i = 0; i < num_points; ++i) // unsigned index: same type as num_points
+        {
+            //Perform the carrier wipe-off
+            bb_signal_sample = input[i] * carrier[i];
+            // Accumulate the baseband sample multiplied by each code replica
+            *VE_out += bb_signal_sample * VE_code[i];
+            *E_out += bb_signal_sample * E_code[i];
+            *P_out += bb_signal_sample * P_code[i];
+            *L_out += bb_signal_sample * L_code[i];
+            *VL_out += bb_signal_sample * VL_code[i];
+        }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,243 @@
+/*!
+ * \file volk_gnsssdr_64f_accumulator_64f.h
+ * \brief Volk protokernel: 64 bits (double) scalar accumulator
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that implements an accumulator of double values
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
+#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Adds up all the doubles of the input buffer (AVX, unaligned loads)
+ \param result Receives the sum of the processed values
+ \param inputBuffer The values to be summed; no particular alignment is needed
+ \param num_points How many values of inputBuffer are summed
+ */
+static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){
+    const unsigned int avx_iters = num_points / 4; // full groups of four doubles
+    const unsigned int leftovers = num_points % 4; // values that do not fill a whole register
+    const double* in = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(32) double lanes[4];
+    __m256d vsum = _mm256_setzero_pd();
+    double total = 0;
+
+    // four independent running sums, one per lane
+    for(unsigned int n = 0; n < avx_iters; ++n)
+        {
+            vsum = _mm256_add_pd(vsum, _mm256_loadu_pd(in));
+            in += 4;
+        }
+
+    // spill the lanes and fold them in order: lane 0 first, lane 3 last
+    _mm256_storeu_pd((double*)lanes, vsum);
+    for(unsigned int k = 0; k < 4; ++k){
+        total += lanes[k];
+    }
+
+    // scalar pass over what is left after the vector loop
+    for(unsigned int k = 0; k < leftovers; ++k){
+        total += in[k];
+    }
+
+    *result = total;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h> // SSE3 header; it also brings in the SSE2 double-precision intrinsics used below
+/*!
+ \brief Accumulates the values in the input buffer (no alignment required)
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+ */
+static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){
+    double returnValue = 0;
+    const unsigned int sse_iters = num_points / 2; // each 128-bit register holds two doubles
+
+    const double* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
+    __m128d accumulator = _mm_setzero_pd();
+    __m128d aVal = _mm_setzero_pd();
+
+    for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            aVal = _mm_loadu_pd(aPtr);
+            accumulator = _mm_add_pd(accumulator, aVal);
+            aPtr += 2;
+        }
+
+    _mm_storeu_pd((double*)tempBuffer,accumulator); // spill the two partial sums
+
+    for(unsigned int i = 0; i<2; ++i){
+        returnValue += tempBuffer[i];
+    }
+
+    for(unsigned int i = 0; i<(num_points % 2); ++i){ // at most one leftover value
+        returnValue += (*aPtr++);
+    }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Sums every value of the input buffer into a single double
+ \param result Where the total is written
+ \param inputBuffer The values to be summed
+ \param num_points How many values of inputBuffer are summed
+ */
+static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){
+    double total = 0;
+    const double* const end = inputBuffer + num_points;
+
+    for(const double* cursor = inputBuffer; cursor != end; ++cursor){
+        total += *cursor;
+    }
+    *result = total;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
+#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Adds up all the doubles of the input buffer (AVX, aligned loads)
+ \param result Receives the sum of the processed values
+ \param inputBuffer The values to be summed; read with aligned 256-bit loads
+ \param num_points How many values of inputBuffer are summed
+ */
+static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){
+    const unsigned int avx_iters = num_points / 4; // full groups of four doubles
+    const unsigned int leftovers = num_points % 4; // values that do not fill a whole register
+    const double* in = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(32) double lanes[4];
+    __m256d vsum = _mm256_setzero_pd();
+    double total = 0;
+
+    // four independent running sums, one per lane
+    for(unsigned int n = 0; n < avx_iters; ++n)
+        {
+            vsum = _mm256_add_pd(vsum, _mm256_load_pd(in));
+            in += 4;
+        }
+
+    // spill the lanes and fold them in order: lane 0 first, lane 3 last
+    _mm256_store_pd((double*)lanes, vsum);
+    for(unsigned int k = 0; k < 4; ++k){
+        total += lanes[k];
+    }
+
+    // scalar pass over what is left after the vector loop
+    for(unsigned int k = 0; k < leftovers; ++k){
+        total += in[k];
+    }
+
+    *result = total;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h> // SSE3 header; it also brings in the SSE2 double-precision intrinsics used below
+/*!
+ \brief Accumulates the values in the input buffer (aligned loads are used)
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+ */
+static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){
+    double returnValue = 0;
+    const unsigned int sse_iters = num_points / 2; // each 128-bit register holds two doubles
+
+    const double* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
+    __m128d accumulator = _mm_setzero_pd();
+    __m128d aVal = _mm_setzero_pd();
+
+    for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            aVal = _mm_load_pd(aPtr);
+            accumulator = _mm_add_pd(accumulator, aVal);
+            aPtr += 2;
+        }
+
+    _mm_store_pd((double*)tempBuffer,accumulator); // spill the two partial sums
+
+    for(unsigned int i = 0; i<2; ++i){
+        returnValue += tempBuffer[i];
+    }
+
+    for(unsigned int i = 0; i<(num_points % 2); ++i){ // at most one leftover value
+        returnValue += (*aPtr++);
+    }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Sums every value of the input buffer into a single double
+ \param result Where the total is written
+ \param inputBuffer The values to be summed
+ \param num_points How many values of inputBuffer are summed
+ */
+static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){
+    double total = 0;
+    const double* const end = inputBuffer + num_points;
+
+    for(const double* cursor = inputBuffer; cursor != end; ++cursor){
+        total += *cursor;
+    }
+    *result = total;
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,183 @@
+/*!
+ * \file volk_gnsssdr_8i_accumulator_s8i.h
+ * \brief Volk protokernel: 8 bits (char) scalar accumulator
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that implements an accumulator of char values
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
+#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h> // _mm_lddqu_si128 is an SSE3 intrinsic; this header also provides the SSE2 integer ones
+/*!
+ \brief Accumulates the values in the input buffer (no alignment required)
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+ */
+static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){
+    char returnValue = 0;
+    const unsigned int sse_iters = num_points / 16; // each 128-bit register holds sixteen chars
+
+    const char* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
+    __m128i accumulator = _mm_setzero_si128();
+    __m128i aVal = _mm_setzero_si128();
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        aVal = _mm_lddqu_si128((__m128i*)aPtr);
+        accumulator = _mm_add_epi8(accumulator, aVal); // per-lane 8-bit addition
+        aPtr += 16;
+    }
+    _mm_storeu_si128((__m128i*)tempBuffer,accumulator);
+
+    for(unsigned int i = 0; i<16; ++i){
+        returnValue += tempBuffer[i];
+    }
+
+    for(unsigned int i = 0; i<(num_points % 16); ++i){ // values left after the vector loop
+        returnValue += (*aPtr++);
+    }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Sums every value of the input buffer into a single char
+ \param result Where the total is written
+ \param inputBuffer The values to be summed
+ \param num_points How many values of inputBuffer are summed
+ */
+static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){
+    char total = 0;
+    const char* const end = inputBuffer + num_points;
+
+    for(const char* cursor = inputBuffer; cursor != end; ++cursor){
+        total += *cursor;
+    }
+    *result = total;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
+#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h> // SSE3 header; it also provides the SSE2 integer intrinsics used below
+/*!
+ \brief Accumulates the values in the input buffer (aligned loads are used)
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+ */
+static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){
+    char returnValue = 0;
+    const unsigned int sse_iters = num_points / 16; // each 128-bit register holds sixteen chars
+
+    const char* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
+    __m128i accumulator = _mm_setzero_si128();
+    __m128i aVal = _mm_setzero_si128();
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        aVal = _mm_load_si128((__m128i*)aPtr);
+        accumulator = _mm_add_epi8(accumulator, aVal); // per-lane 8-bit addition
+        aPtr += 16;
+    }
+    _mm_store_si128((__m128i*)tempBuffer,accumulator);
+
+    for(unsigned int i = 0; i<16; ++i){
+        returnValue += tempBuffer[i];
+    }
+
+    for(unsigned int i = 0; i<(num_points % 16); ++i){ // values left after the vector loop
+        returnValue += (*aPtr++);
+    }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Sums every value of the input buffer into a single char
+ \param result Where the total is written
+ \param inputBuffer The values to be summed
+ \param num_points How many values of inputBuffer are summed
+ */
+static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){
+    char total = 0;
+    const char* const end = inputBuffer + num_points;
+
+    for(const char* cursor = inputBuffer; cursor != end; ++cursor){
+        total += *cursor;
+    }
+    *result = total;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Accumulates the values in the input buffer through the external ORC routine
+ \param result Receives the byte at offset 1 of the 16-bit value filled in by the ORC routine
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+ */
+extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
+static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
+
+    // 16-bit scratch value handed to the external ORC implementation
+    short orc_sum = 0;
+
+    volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&orc_sum, inputBuffer, num_points);
+
+    // NOTE(review): offset 1 is the high byte on little-endian targets - confirm against the ORC code
+    *result = ((const char*)&orc_sum)[1];
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */
+
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,493 @@
+/*!
+ * \file volk_gnsssdr_8i_index_max_16u.h
+ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
+#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Returns the index of the max value in src0 (first occurrence on ties)
+ \param target The index of the max value in src0; not written when num_points is 0
+ \param src0 The buffer of data to be analysed (no alignment required)
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 32;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+        __m256i ones, compareResults, currentValues;
+        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
+
+        ones = _mm256_set1_epi8(0xFF);
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
+
+                lo = _mm256_castsi256_si128(currentValues);
+                hi = _mm256_extractf128_si256(currentValues,1);
+
+                compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
+                compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
+
+                //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
+                compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
+
+                if (!_mm256_testc_si256(compareResults, ones)) // at least one value is not below the current max
+                    {
+                        _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
+
+                        for(int i = 0; i < 32; i++)
+                            {
+                                if(currentValuesBuffer[i] > max)
+                                    {
+                                        index = inputPtr - basePtr + i;
+                                        max = currentValuesBuffer[i];
+                                    }
+                            }
+                        maxValues = _mm_set1_epi8(max);
+                    }
+
+                inputPtr += 32;
+            }
+
+        for(unsigned int i = 0; i<(num_points % 32); ++i) // tail: inputPtr is the first unprocessed value
+            {
+                if(inputPtr[i] > max)
+                    {
+                        index = (unsigned int)(inputPtr - basePtr) + i;
+                        max = inputPtr[i];
+                    }
+            }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include<smmintrin.h>
+/*!
+ \brief Returns the index of the max value in src0 (first occurrence on ties)
+ \param target The index of the max value in src0; not written when num_points is 0
+ \param src0 The buffer of data to be analysed (no alignment required)
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
+
+                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+
+                if (!_mm_test_all_ones(compareResults)) // at least one value is not below the current max
+                    {
+                        _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
+
+                        for(int i = 0; i < 16; i++)
+                            {
+                                if(currentValuesBuffer[i] > max)
+                                    {
+                                        index = inputPtr - basePtr + i;
+                                        max = currentValuesBuffer[i];
+                                    }
+                            }
+                        maxValues = _mm_set1_epi8(max);
+                    }
+
+                inputPtr += 16;
+            }
+
+        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr is the first unprocessed value
+            {
+                if(inputPtr[i] > max)
+                    {
+                        index = (unsigned int)(inputPtr - basePtr) + i;
+                        max = inputPtr[i];
+                    }
+            }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h> // SSE2 integer intrinsics (same header as the aligned variant)
+/*!
+ \brief Returns the index of the max value in src0 (first occurrence on ties)
+ \param target The index of the max value in src0; not written when num_points is 0
+ \param src0 The buffer of data to be analysed (no alignment required)
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm_loadu_si128((__m128i*)inputPtr);
+                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+                mask = _mm_movemask_epi8(compareResults); // one bit per lane: set when max > value
+
+                if (mask != 0xFFFF)
+                    {
+                        _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
+                        mask = ~mask; // now a set bit marks a lane that is not below the current max
+                        int i = 0;
+                        while (mask > 0)
+                            {
+                                if ((mask & 1) == 1)
+                                    {
+                                        if(currentValuesBuffer[i] > max)
+                                            {
+                                                index = inputPtr - basePtr + i;
+                                                max = currentValuesBuffer[i];
+                                            }
+                                    }
+                                i++;
+                                mask >>= 1;
+                            }
+                        maxValues = _mm_set1_epi8(max);
+                    }
+                inputPtr += 16;
+            }
+
+        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr is the first unprocessed value
+            {
+                if(inputPtr[i] > max)
+                    {
+                        index = (unsigned int)(inputPtr - basePtr) + i;
+                        max = inputPtr[i];
+                    }
+            }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Finds the position of the largest value in src0 (first occurrence wins on ties)
+ \param target Receives the index of the largest value; left untouched when num_points is 0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
+
+    // nothing to scan: the output is deliberately not written
+    if(num_points == 0) return;
+
+    unsigned int best = 0;
+    char bestValue = src0[0];
+
+    for(unsigned int pos = 1; pos < num_points; ++pos)
+        {
+            // strict comparison keeps the earliest position among equal values
+            if(src0[pos] <= bestValue) continue;
+            bestValue = src0[pos];
+            best = pos;
+        }
+
+    *target = best;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
+#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Returns the index of the max value in src0 (first occurrence on ties)
+ \param target The index of the max value in src0; not written when num_points is 0
+ \param src0 The buffer of data to be analysed (read with aligned 256-bit loads)
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 32;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+        __m256i ones, compareResults, currentValues;
+        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
+
+        ones = _mm256_set1_epi8(0xFF);
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm256_load_si256((__m256i*)inputPtr);
+
+                lo = _mm256_castsi256_si128(currentValues);
+                hi = _mm256_extractf128_si256(currentValues,1);
+
+                compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
+                compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
+
+                //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
+                compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
+
+                if (!_mm256_testc_si256(compareResults, ones)) // at least one value is not below the current max
+                    {
+                        _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
+
+                        for(int i = 0; i < 32; i++)
+                            {
+                                if(currentValuesBuffer[i] > max)
+                                    {
+                                        index = inputPtr - basePtr + i;
+                                        max = currentValuesBuffer[i];
+                                    }
+                            }
+                        maxValues = _mm_set1_epi8(max);
+                    }
+
+                inputPtr += 32;
+            }
+
+        for(unsigned int i = 0; i<(num_points % 32); ++i) // tail: inputPtr is the first unprocessed value
+            {
+                if(inputPtr[i] > max)
+                    {
+                        index = (unsigned int)(inputPtr - basePtr) + i;
+                        max = inputPtr[i];
+                    }
+            }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "emmintrin.h"
+/*!
+ \brief Returns the index of the max value in src0 (first occurrence on ties)
+ \param target The index of the max value in src0; not written when num_points is 0
+ \param src0 The buffer of data to be analysed (read with aligned 128-bit loads)
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm_load_si128((__m128i*)inputPtr);
+
+                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+
+                if (!_mm_test_all_ones(compareResults)) // at least one value is not below the current max
+                    {
+                        _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+
+                        for(int i = 0; i < 16; i++)
+                            {
+                                if(currentValuesBuffer[i] > max)
+                                    {
+                                        index = inputPtr - basePtr + i;
+                                        max = currentValuesBuffer[i];
+                                    }
+                            }
+                        maxValues = _mm_set1_epi8(max);
+                    }
+
+                inputPtr += 16;
+            }
+
+        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr is the first unprocessed value
+            {
+                if(inputPtr[i] > max)
+                    {
+                        index = (unsigned int)(inputPtr - basePtr) + i;
+                        max = inputPtr[i];
+                    }
+            }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Returns the index of the max value in src0 (first occurrence on ties)
+ \param target The index of the max value in src0; not written when num_points is 0
+ \param src0 The buffer of data to be analysed (read with aligned 128-bit loads)
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm_load_si128((__m128i*)inputPtr);
+                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+                mask = _mm_movemask_epi8(compareResults); // one bit per lane: set when max > value
+
+                if (mask != 0xFFFF)
+                    {
+                        _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+                        mask = ~mask; // now a set bit marks a lane that is not below the current max
+                        int i = 0;
+                        while (mask > 0)
+                            {
+                                if ((mask & 1) == 1)
+                                    {
+                                        if(currentValuesBuffer[i] > max)
+                                            {
+                                                index = inputPtr - basePtr + i;
+                                                max = currentValuesBuffer[i];
+                                            }
+                                    }
+                                i++;
+                                mask >>= 1;
+                            }
+                        maxValues = _mm_set1_epi8(max);
+                    }
+                inputPtr += 16;
+            }
+
+        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr is the first unprocessed value
+            {
+                if(inputPtr[i] > max)
+                    {
+                        index = (unsigned int)(inputPtr - basePtr) + i;
+                        max = inputPtr[i];
+                    }
+            }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Finds the position of the largest value in src0 (first occurrence wins on ties)
+ \param target Receives the index of the largest value; left untouched when num_points is 0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
+
+    // nothing to scan: the output is deliberately not written
+    if(num_points == 0) return;
+
+    unsigned int best = 0;
+    char bestValue = src0[0];
+
+    for(unsigned int pos = 1; pos < num_points; ++pos)
+        {
+            // strict comparison keeps the earliest position among equal values
+            if(src0[pos] <= bestValue) continue;
+            bestValue = src0[pos];
+            best = pos;
+        }
+
+    *target = best;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,327 @@
+/*!
+ * \file volk_gnsssdr_8i_max_s8i.h
+ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
+#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include<smmintrin.h>
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
+    // NOTE(review): 'target' is received by value, so the assignment at the end of
+    // this function never reaches the caller; handing the result back would need a
+    // 'char*' parameter, which is an interface change left to the maintainers.
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+        const unsigned int tail_points = num_points % 16;
+        const char* inputPtr = src0;
+        char max = src0[0];
+        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
+        __m128i maxValues, currentValues;
+
+        // Every lane starts at src0[0], a value that really belongs to the input.
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                // Unaligned load: src0 is not required to be 16-byte aligned here.
+                currentValues = _mm_loadu_si128((const __m128i*)inputPtr);
+                // Lane-wise signed 8-bit maximum (same result as cmpgt + blendv).
+                maxValues = _mm_max_epi8(maxValues, currentValues);
+                inputPtr += 16;
+            }
+        // Horizontal reduction of the 16 per-lane maxima.
+        _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
+        for(unsigned int i = 0; i < 16; ++i)
+            {
+                if(maxValuesBuffer[i] > max) max = maxValuesBuffer[i];
+            }
+
+        // Tail: the samples after the last full group of 16. They have to be read
+        // through inputPtr; indexing src0 here would re-scan the head of the buffer.
+        for(unsigned int i = 0; i < tail_points; ++i)
+            {
+                if(inputPtr[i] > max) max = inputPtr[i];
+            }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Computes the max value in src0 (SSE2, unaligned loads)
+ \param target Receives the max value; NOTE(review): it is passed by value, so the caller never sees the result
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+        const unsigned int tail_points = num_points % 16;
+        const char* inputPtr = src0;
+        char max = src0[0];
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                currentValues = _mm_loadu_si128((const __m128i*)inputPtr);
+                // One mask bit per lane, set where the running max is strictly greater.
+                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+                mask = _mm_movemask_epi8(compareResults);
+
+                // All 16 bits set means that no lane of this group can raise the max.
+                if (mask != 0xFFFF)
+                    {
+                        _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
+                        // After the inversion the set bits mark the candidate lanes.
+                        mask = ~mask;
+                        for(int i = 0; mask > 0; ++i, mask >>= 1)
+                            {
+                                if (((mask & 1) == 1) && (currentValuesBuffer[i] > max))
+                                    {
+                                        max = currentValuesBuffer[i];
+                                    }
+                            }
+                        // Broadcast again so that later groups compare against the new max.
+                        maxValues = _mm_set1_epi8(max);
+                    }
+                inputPtr += 16;
+            }
+
+        // Tail: the samples after the last full group of 16. They have to be read
+        // through inputPtr; indexing src0 here would re-scan the head of the buffer.
+        for(unsigned int i = 0; i < tail_points; ++i)
+            {
+                if(inputPtr[i] > max)
+                    {
+                        max = inputPtr[i];
+                    }
+            }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
+    // NOTE(review): 'target' is a by-value char, so the result stored in it below
+    // is not visible to the caller.
+    if(num_points == 0) return;
+    // Seed with the first sample, then keep the largest of the remaining ones.
+    char largest = src0[0];
+    const char* cursor = src0 + 1;
+    const char* const end = src0 + num_points;
+    while(cursor != end)
+        {
+            if(*cursor > largest) largest = *cursor;
+            ++cursor;
+        }
+    target = largest;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
+#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
+    // NOTE(review): 'target' is received by value, so the assignment at the end of
+    // this function never reaches the caller; handing the result back would need a
+    // 'char*' parameter, which is an interface change left to the maintainers.
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+        const unsigned int tail_points = num_points % 16;
+        const char* inputPtr = src0;
+        char max = src0[0];
+        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
+        __m128i maxValues, currentValues;
+
+        // Every lane starts at src0[0], a value that really belongs to the input.
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                // Aligned load: src0 has to be 16-byte aligned for this variant.
+                currentValues = _mm_load_si128((const __m128i*)inputPtr);
+                // Lane-wise signed 8-bit maximum (same result as cmpgt + blendv).
+                maxValues = _mm_max_epi8(maxValues, currentValues);
+                inputPtr += 16;
+            }
+        // Horizontal reduction of the 16 per-lane maxima.
+        _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
+        for(unsigned int i = 0; i < 16; ++i)
+            {
+                if(maxValuesBuffer[i] > max) max = maxValuesBuffer[i];
+            }
+
+        // Tail: the samples after the last full group of 16. They have to be read
+        // through inputPtr; indexing src0 here would re-scan the head of the buffer.
+        for(unsigned int i = 0; i < tail_points; ++i)
+            {
+                if(inputPtr[i] > max) max = inputPtr[i];
+            }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
+    // NOTE(review): 'target' is received by value, so the assignment at the end of
+    // this function never reaches the caller.
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+        const unsigned int tail_points = num_points % 16;
+        const char* inputPtr = src0;
+        char max = src0[0];
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+            {
+                // Aligned load: src0 has to be 16-byte aligned for this variant.
+                currentValues = _mm_load_si128((const __m128i*)inputPtr);
+                // One mask bit per lane, set where the running max is strictly greater.
+                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+                mask = _mm_movemask_epi8(compareResults);
+
+                // All 16 bits set means that no lane of this group can raise the max.
+                if (mask != 0xFFFF)
+                    {
+                        _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
+                        // After the inversion the set bits mark the candidate lanes.
+                        mask = ~mask;
+                        for(int i = 0; mask > 0; ++i, mask >>= 1)
+                            {
+                                if (((mask & 1) == 1) && (currentValuesBuffer[i] > max))
+                                    {
+                                        max = currentValuesBuffer[i];
+                                    }
+                            }
+                        // Broadcast again so that later groups compare against the new max.
+                        maxValues = _mm_set1_epi8(max);
+                    }
+                inputPtr += 16;
+            }
+
+        // Tail: the samples after the last full group of 16. They have to be read
+        // through inputPtr; indexing src0 here would re-scan the head of the buffer.
+        for(unsigned int i = 0; i < tail_points; ++i)
+            {
+                if(inputPtr[i] > max) max = inputPtr[i];
+            }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+ */
+static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
+    // NOTE(review): 'target' is a by-value char, so the value assigned to it at
+    // the end of this function is not visible to the caller.
+    if(num_points > 0)
+        {
+            // A single emptiness check is enough; src0[0] is only read once
+            // num_points is known to be non-zero.
+            char running_max = src0[0];
+            unsigned int idx = 1;
+
+            while(idx < num_points)
+                {
+                    const char sample = src0[idx++];
+                    if(sample > running_max) running_max = sample;
+                }
+            target = running_max;
+        }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,184 @@
+/*!
+ * \file volk_gnsssdr_8i_x2_add_8i.h
+ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that adds pairs of 8 bits (char) scalars
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
+#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Adds the two input vectors element by element and stores the sums in the third vector (SSE2, unaligned accesses)
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    // 16 chars fit in one 128-bit register.
+    const unsigned int sse_iters = num_points / 16;
+    const unsigned int tail_points = num_points % 16;
+
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr = bVector;
+    __m128i aVal, bVal, cVal;
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would need SSE3,
+        // which the LV_HAVE_SSE2 guard around this kernel does not guarantee.
+        aVal = _mm_loadu_si128((const __m128i*)aPtr);
+        bVal = _mm_loadu_si128((const __m128i*)bPtr);
+        // Wrapping (non-saturating) 8-bit addition of the 16 lanes.
+        cVal = _mm_add_epi8(aVal, bVal);
+        _mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
+
+    // Scalar pass over the remaining num_points % 16 values.
+    for(unsigned int i = 0; i < tail_points; ++i)
+        {
+            *cPtr++ = (*aPtr++) + (*bPtr++);
+        }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    // Plain C reference implementation:
+    //   cVector[k] = aVector[k] + bVector[k]   for every k < num_points.
+    // The sum is computed in int and converted back to char on assignment.
+    unsigned int k;
+    for(k = 0; k < num_points; ++k)
+        {
+            cVector[k] = aVector[k] + bVector[k];
+        }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
+#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include "pmmintrin.h"
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    // Aligned variant: all three pointers are handed to _mm_load_si128 /
+    // _mm_store_si128, which require 16-byte aligned addresses.
+    const unsigned int full_groups = num_points / 16;
+    const unsigned int leftover = num_points % 16;
+
+    const char* lhs = aVector;
+    const char* rhs = bVector;
+    char* out = cVector;
+    unsigned int k;
+
+    // Main loop: one 128-bit register (16 chars) per operand and iteration.
+    for(k = 0; k < full_groups; ++k)
+        {
+            const __m128i left = _mm_load_si128((__m128i*)lhs);
+            const __m128i right = _mm_load_si128((__m128i*)rhs);
+            _mm_store_si128((__m128i*)out, _mm_add_epi8(left, right)); // Store the results back into the C container
+
+            lhs += 16;
+            rhs += 16;
+            out += 16;
+        }
+
+    // The remaining num_points % 16 values are added one at a time.
+    for(k = 0; k < leftover; ++k)
+        {
+            out[k] = lhs[k] + rhs[k];
+        }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    // Walks the three buffers in lock-step and writes one sum per input pair.
+    const char* lhs = aVector;
+    const char* rhs = bVector;
+    char* out = cVector;
+    char* const out_end = cVector + num_points;
+
+    while(out != out_end)
+        *out++ = (*lhs++) + (*rhs++);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and stores the results in the third vector by forwarding every argument, unchanged, to the externally defined ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+ */
+extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points); // only declared here; the definition lives outside this header
+static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ // NOTE(review): named _u_ but forwards to the _a_ implementation; confirm this is intended
+ volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,326 @@
+/*!
+ * \file volk_gnsssdr_8ic_conjugate_8ic.h
+ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that calculates the conjugate of a
+ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
+#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // A 256-bit register holds 16 complex samples of 2 bytes each.
+    const unsigned int sse_iters = num_points / 16;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    __m256 tmp;
+    __m128i tmp128lo, tmp128hi;
+    // Two's complement negation of every second byte: flip all of its bits (conjugator1), then add 1 (conjugator2).
+    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
+    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+        {
+            tmp = _mm256_loadu_ps((const float*)a);
+            tmp = _mm256_xor_ps(tmp, conjugator1);
+            // The byte-wise add is done with 128-bit instructions, one half of the register at a time.
+            tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
+            tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
+            tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
+            tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
+            //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
+            // _mm256_insertf128_si256 returns an __m256i, so it is cast explicitly before being assigned to the __m256 tmp.
+            tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), tmp128hi, 1));
+            _mm256_storeu_ps((float*)c, tmp);
+            a += 16;
+            c += 16;
+        }
+    for (unsigned int i = 0; i < (num_points % 16); ++i)
+        {
+            *c++ = lv_conj(*a++);
+        }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSSE3
+#include "tmmintrin.h"
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // Each 128-bit register carries 8 complex samples; the unaligned load and
+    // store intrinsics are used, so no alignment is required of either buffer.
+    const unsigned int full_groups = num_points / 8;
+    const unsigned int leftover = num_points % 8;
+    // Control bytes for _mm_sign_epi8: a lane is kept where the control byte is
+    // positive (+1) and negated where it is negative (-1), i.e. every second byte.
+    const __m128i sign_pattern = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+    lv_8sc_t* out = cVector;
+    const lv_8sc_t* in = aVector;
+    unsigned int k;
+
+    for (k = 0; k < full_groups; ++k, in += 8, out += 8)
+        {
+            const __m128i samples = _mm_lddqu_si128((__m128i*)in);
+            _mm_storeu_si128((__m128i*)out, _mm_sign_epi8(samples, sign_pattern));
+        }
+
+    // Scalar fallback for the last num_points % 8 samples.
+    for (k = 0; k < leftover; ++k)
+        {
+            out[k] = lv_conj(in[k]);
+        }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // Eight complex samples (16 bytes) are conjugated per 128-bit register.
+    const unsigned int full_groups = num_points / 8;
+    const unsigned int leftover = num_points % 8;
+
+    // Negating a byte in two's complement: invert all of its bits, then add one.
+    // Both constants are zero on the even bytes, so those pass through unchanged.
+    const __m128i flip_bits = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+    const __m128i plus_one = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    lv_8sc_t* out = cVector;
+    const lv_8sc_t* in = aVector;
+    unsigned int k;
+
+    for (k = 0; k < full_groups; ++k)
+        {
+            __m128i samples = _mm_lddqu_si128((__m128i*)in);
+            samples = _mm_add_epi8(_mm_xor_si128(samples, flip_bits), plus_one);
+            _mm_storeu_si128((__m128i*)out, samples);
+            in += 8;
+            out += 8;
+        }
+
+    // Scalar fallback for the last num_points % 8 samples.
+    for (k = 0; k < leftover; ++k) out[k] = lv_conj(in[k]);
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // Reference implementation: conjugates the samples one by one with lv_conj.
+    unsigned int k;
+
+    for(k = 0; k < num_points; ++k)
+        {
+            cVector[k] = lv_conj(aVector[k]);
+        }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
+#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // A 256-bit register holds 16 complex samples of 2 bytes each; the aligned load/store used below need 32-byte aligned buffers.
+    const unsigned int sse_iters = num_points / 16;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    __m256 tmp;
+    __m128i tmp128lo, tmp128hi;
+    // Two's complement negation of every second byte: flip all of its bits (conjugator1), then add 1 (conjugator2).
+    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
+    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+        {
+            tmp = _mm256_load_ps((const float*)a);
+            tmp = _mm256_xor_ps(tmp, conjugator1);
+            // The byte-wise add is done with 128-bit instructions, one half of the register at a time.
+            tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
+            tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
+            tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
+            tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
+            //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
+            // _mm256_insertf128_si256 returns an __m256i, so it is cast explicitly before being assigned to the __m256 tmp.
+            tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), tmp128hi, 1));
+            _mm256_store_ps((float*)c, tmp);
+            a += 16;
+            c += 16;
+        }
+    for (unsigned int i = 0; i < (num_points % 16); ++i)
+        {
+            *c++ = lv_conj(*a++);
+        }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSSE3
+#include "tmmintrin.h"
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // Aligned variant: aVector and cVector are accessed with _mm_load_si128 /
+    // _mm_store_si128, which require 16-byte aligned addresses.
+    const unsigned int full_groups = num_points / 8;
+    const unsigned int leftover = num_points % 8;
+    lv_8sc_t* out = cVector;
+    const lv_8sc_t* in = aVector;
+    unsigned int k = 0;
+
+    // Control bytes for _mm_sign_epi8: +1 leaves a byte as it is, -1 negates it.
+    const __m128i sign_pattern = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    while (k < full_groups)
+        {
+            const __m128i samples = _mm_load_si128((__m128i*)in);
+            _mm_store_si128((__m128i*)out, _mm_sign_epi8(samples, sign_pattern));
+            in += 8;
+            out += 8;
+            ++k;
+        }
+
+    // Scalar fallback for the last num_points % 8 samples.
+    for (k = 0; k < leftover; ++k) out[k] = lv_conj(in[k]);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // Aligned variant: the 128-bit load and store below need 16-byte aligned buffers.
+    const unsigned int full_groups = num_points / 8;
+    const unsigned int leftover = num_points % 8;
+    lv_8sc_t* out = cVector;
+    const lv_8sc_t* in = aVector;
+
+    // Every second byte is negated in two's complement: XOR with 0xFF, then add 1.
+    const __m128i flip_bits = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+    const __m128i plus_one = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int k = 0; k < full_groups; ++k)
+        {
+            const __m128i loaded = _mm_load_si128((__m128i*)in);
+            const __m128i inverted = _mm_xor_si128(loaded, flip_bits);
+            _mm_store_si128((__m128i*)out, _mm_add_epi8(inverted, plus_one));
+            in += 8;
+            out += 8;
+        }
+
+    // Scalar fallback for the last num_points % 8 samples.
+    for (unsigned int k = 0; k < leftover; ++k)
+        {
+            out[k] = lv_conj(in[k]);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Takes the conjugate of an unsigned char vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    // Output and input advance together; the loop ends after num_points samples.
+    const lv_8sc_t* in = aVector;
+    lv_8sc_t* out = cVector;
+    lv_8sc_t* const out_end = cVector + num_points;
+
+    while(out != out_end)
+        *out++ = lv_conj(*in++);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Takes the conjugate of a vector of lv_8sc_t samples by forwarding every argument, unchanged, to the externally defined ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of lv_8sc_t values in aVector to be conjugated and stored into cVector
+ */
+extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points); // only declared here; the definition lives outside this header
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ // NOTE(review): named _u_ but forwards to the _a_ implementation; confirm this is intended
+ volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,320 @@
+/*!
+ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
+ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that calculates the magnitude squared of a
+ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+ * result = (real*real) + (imag*imag)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
+#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include "tmmintrin.h"
+/*!
+ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (unaligned loads and stores)
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+
+ const unsigned int sse_iters = num_points / 16; // each pass does two 16-byte loads and stores 16 output bytes
+
+ const char* complexVectorPtr = (char*)complexVector; // input is walked as raw bytes: real, imag, real, imag, ...
+ char* magnitudeVectorPtr = magnitudeVector;
+
+ __m128i zero, result8;
+ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
+ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;
+
+ zero = _mm_setzero_si128();
+ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // even bytes 0..14 -> low 8 bytes, high 8 bytes zeroed
+ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); // even bytes 0..14 -> high 8 bytes, low 8 bytes zeroed
+
+ for(int number = 0;number < sse_iters; number++)
+ {
+ avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); // unaligned 16-byte load
+ avectorlo = _mm_unpacklo_epi8 (avector, zero); // interleave with zero: low 8 bytes become 16-bit lanes
+ avectorhi = _mm_unpackhi_epi8 (avector, zero); // same for the high 8 bytes
+ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); // square each lane, keeping the low 16 bits
+ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
+ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); // add adjacent lanes: real^2 + imag^2; NOTE(review): SSSE3 intrinsic under an LV_HAVE_SSE3 guard - confirm
+
+ complexVectorPtr += 16;
+
+ bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); // second 16 bytes, processed exactly like the first
+ bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
+ bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
+ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
+ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
+ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
+
+ complexVectorPtr += 16;
+
+ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); // keep the low byte of every 16-bit sum; _mm_shuffle_epi8 is also SSSE3
+
+ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); // unaligned 16-byte store
+
+ magnitudeVectorPtr += 16;
+
+
+ }
+
+ for (int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining num_points % 16 values
+ {
+ const char valReal = *complexVectorPtr++;
+ const char valImag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); // the int result is narrowed to char on store
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+//#ifdef LV_HAVE_SSE
+//#include <xmmintrin.h>
+///*!
+// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
+// \param complexVector The vector containing the complex input values
+// \param magnitudeVector The vector containing the real output values
+// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+// */
+//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+// unsigned int number = 0;
+// const unsigned int quarterPoints = num_points / 4;
+//
+// const float* complexVectorPtr = (float*)complexVector;
+// float* magnitudeVectorPtr = magnitudeVector;
+//
+// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+// for(;number < quarterPoints; number++){
+// cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+// complexVectorPtr += 4;
+//
+// cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+// complexVectorPtr += 4;
+//
+// // Arrange in i1i2i3i4 format
+// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+// // Arrange in q1q2q3q4 format
+// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+//
+// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+//
+// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+//
+// _mm_storeu_ps(magnitudeVectorPtr, result);
+// magnitudeVectorPtr += 4;
+// }
+//
+// number = quarterPoints * 4;
+// for(; number < num_points; number++){
+// float val1Real = *complexVectorPtr++;
+// float val1Imag = *complexVectorPtr++;
+// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+// }
+//}
+//#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (plain scalar loop)
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+ const char* complexVectorPtr = (char*)complexVector; // input is walked as raw bytes: real, imag, real, imag, ...
+ char* magnitudeVectorPtr = magnitudeVector;
+
+ for(int number = 0; number < num_points; number++){ // NOTE(review): signed index compared against unsigned num_points
+ const char real = *complexVectorPtr++;
+ const char imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // the int result is narrowed to char on store
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
+#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (aligned loads and stores)
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+
+ const unsigned int sse_iters = num_points / 16; // each pass does two 16-byte loads and stores 16 output bytes
+
+ const char* complexVectorPtr = (char*)complexVector; // input is walked as raw bytes: real, imag, real, imag, ...
+ char* magnitudeVectorPtr = magnitudeVector;
+
+ __m128i zero, result8;
+ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
+ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;
+
+ zero = _mm_setzero_si128();
+ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // even bytes 0..14 -> low 8 bytes, high 8 bytes zeroed
+ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); // even bytes 0..14 -> high 8 bytes, low 8 bytes zeroed
+
+ for(int number = 0;number < sse_iters; number++)
+ {
+ avector = _mm_load_si128((__m128i*)complexVectorPtr); // aligned load: the address must be 16-byte aligned
+ avectorlo = _mm_unpacklo_epi8 (avector, zero); // interleave with zero: low 8 bytes become 16-bit lanes
+ avectorhi = _mm_unpackhi_epi8 (avector, zero); // same for the high 8 bytes
+ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); // square each lane, keeping the low 16 bits
+ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
+ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); // add adjacent lanes: real^2 + imag^2; NOTE(review): SSSE3 intrinsic, needs tmmintrin.h - confirm guard/includes
+
+ complexVectorPtr += 16;
+
+ bvector = _mm_load_si128((__m128i*)complexVectorPtr); // second 16 bytes, processed exactly like the first
+ bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
+ bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
+ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
+ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
+ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
+
+ complexVectorPtr += 16;
+
+ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); // keep the low byte of every 16-bit sum; _mm_shuffle_epi8 is also SSSE3
+
+ _mm_store_si128((__m128i*)magnitudeVectorPtr, result8); // aligned store: the address must be 16-byte aligned
+
+ magnitudeVectorPtr += 16;
+
+
+ }
+
+ for (int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining num_points % 16 values
+ {
+ const char valReal = *complexVectorPtr++;
+ const char valImag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); // the int result is narrowed to char on store
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+//#ifdef LV_HAVE_SSE
+//#include <xmmintrin.h>
+///*!
+// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
+// \param complexVector The vector containing the complex input values
+// \param magnitudeVector The vector containing the real output values
+// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+// */
+//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+// unsigned int number = 0;
+// const unsigned int quarterPoints = num_points / 4;
+//
+// const float* complexVectorPtr = (float*)complexVector;
+// float* magnitudeVectorPtr = magnitudeVector;
+//
+// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+// for(;number < quarterPoints; number++){
+// cplxValue1 = _mm_load_ps(complexVectorPtr);
+// complexVectorPtr += 4;
+//
+// cplxValue2 = _mm_load_ps(complexVectorPtr);
+// complexVectorPtr += 4;
+//
+// // Arrange in i1i2i3i4 format
+// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+// // Arrange in q1q2q3q4 format
+// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+//
+// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+//
+// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+//
+// _mm_store_ps(magnitudeVectorPtr, result);
+// magnitudeVectorPtr += 4;
+// }
+//
+// number = quarterPoints * 4;
+// for(; number < num_points; number++){
+// float val1Real = *complexVectorPtr++;
+// float val1Imag = *complexVectorPtr++;
+// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+// }
+//}
+//#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (plain scalar loop)
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+ const char* complexVectorPtr = (char*)complexVector; // input is walked as raw bytes: real, imag, real, imag, ...
+ char* magnitudeVectorPtr = magnitudeVector;
+
+ for(int number = 0; number < num_points; number++){ // NOTE(review): signed index compared against unsigned num_points
+ const char real = *complexVectorPtr++;
+ const char imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // the int result is narrowed to char on store
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Calculates the magnitude squared of complexVector by forwarding to the external ORC implementation
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+ */
+extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+ volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points); // thin wrapper: arguments are passed through unchanged
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,271 @@
+/*!
+ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
+ * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that multiplies a group of 16 bits vectors
+ * (8 bits the real part and 8 bits the imaginary part) by one constant vector
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
+#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Multiplies the input vector by a complex scalar and stores the results in cVector (unaligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+
+ const unsigned int sse_iters = num_points / 8; // 8 elements per 16-byte register; assumes sizeof(lv_8sc_t) == 2 - TODO confirm
+
+ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+
+ lv_8sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: keeps the low byte
+
+ y = _mm_set1_epi16 (*(short*)&scalar); // broadcast the scalar, read as one 16-bit value, to all lanes
+ imagy = _mm_srli_si128 (y, 1); // byte shift: the second byte of each pair moves into the low byte of the lane
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1); // first byte of each pair, isolated in its 16-bit lane
+
+ for(int number = 0;number < sse_iters; number++){
+
+ x = _mm_lddqu_si128((__m128i*)a); // unaligned 16-byte load
+
+ imagx = _mm_srli_si128 (x, 1); // split x into per-lane imaginary and real bytes, as done for the scalar
+ imagx = _mm_and_si128 (imagx, mult1);
+ realx = _mm_and_si128 (x, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // the four partial products of (a+jb)(c+jd)
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: ac - bd
+ realc = _mm_and_si128 (realc, mult1); // only the low byte of each result is kept
+ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: ad + bc
+ imagc = _mm_and_si128 (imagc, mult1);
+ imagc = _mm_slli_si128 (imagc, 1); // move the imaginary byte back to the second byte of each pair
+
+ totalc = _mm_or_si128 (realc, imagc); // re-interleave real and imaginary bytes
+
+ _mm_storeu_si128((__m128i*)c, totalc); // unaligned 16-byte store
+
+ a += 8;
+ c += 8;
+ }
+
+ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining num_points % 8 values
+ {
+ *c++ = (*a++) * scalar;
+ }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the input vector by a complex scalar and stores the results in cVector (scalar loop, unrolled by 8)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+
+ /*lv_8sc_t* cPtr = cVector;
+ const lv_8sc_t* aPtr = aVector;
+
+ for (int i = 0; i<num_points; ++i)
+ {
+ *cPtr++ = (*aPtr++) * scalar;
+ }*/
+
+ lv_8sc_t* cPtr = cVector;
+ const lv_8sc_t* aPtr = aVector;
+ unsigned int number = num_points; // counts the elements still to be processed
+
+ // unrolled loop: eight multiplications per pass
+ while (number >= 8){
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // handle the remaining (fewer than 8) elements one at a time
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
+#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Multiplies the input vector by a complex scalar and stores the results in cVector (aligned loads and stores)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+
+ const unsigned int sse_iters = num_points / 8; // 8 elements per 16-byte register; assumes sizeof(lv_8sc_t) == 2 - TODO confirm
+
+ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+
+ lv_8sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: keeps the low byte
+
+ y = _mm_set1_epi16 (*(short*)&scalar); // broadcast the scalar, read as one 16-bit value, to all lanes
+ imagy = _mm_srli_si128 (y, 1); // byte shift: the second byte of each pair moves into the low byte of the lane
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1); // first byte of each pair, isolated in its 16-bit lane
+
+ for(int number = 0;number < sse_iters; number++){
+
+ x = _mm_load_si128((__m128i*)a); // aligned load: the address must be 16-byte aligned
+
+ imagx = _mm_srli_si128 (x, 1); // split x into per-lane imaginary and real bytes, as done for the scalar
+ imagx = _mm_and_si128 (imagx, mult1);
+ realx = _mm_and_si128 (x, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // the four partial products of (a+jb)(c+jd)
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: ac - bd
+ realc = _mm_and_si128 (realc, mult1); // only the low byte of each result is kept
+ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: ad + bc
+ imagc = _mm_and_si128 (imagc, mult1);
+ imagc = _mm_slli_si128 (imagc, 1); // move the imaginary byte back to the second byte of each pair
+
+ totalc = _mm_or_si128 (realc, imagc); // re-interleave real and imaginary bytes
+
+ _mm_store_si128((__m128i*)c, totalc); // aligned store: the address must be 16-byte aligned
+
+ a += 8;
+ c += 8;
+ }
+
+ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining num_points % 8 values
+ {
+ *c++ = (*a++) * scalar;
+ }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the input vector by a complex scalar and stores the results in cVector (scalar loop, unrolled by 8)
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+
+ /*lv_8sc_t* cPtr = cVector;
+ const lv_8sc_t* aPtr = aVector;
+
+ for (int i = 0; i<num_points; ++i)
+ {
+ *cPtr++ = (*aPtr++) * scalar;
+ }*/
+
+ lv_8sc_t* cPtr = cVector;
+ const lv_8sc_t* aPtr = aVector;
+ unsigned int number = num_points; // counts the elements still to be processed
+
+ // unrolled loop: eight multiplications per pass
+ while (number >= 8){
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // handle the remaining (fewer than 8) elements one at a time
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Multiplies the input vector by a complex scalar by forwarding to the external ORC implementation
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+ */
+extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+ volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points); // the scalar is handed over as separate real and imaginary parts
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,499 @@
+/*!
+ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
+ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
+ * and 8 bits the imaginary part) and accumulates them
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
+#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value
+ \param result Pointer to the single complex value where the accumulated result will be stored
+ \param input One of the vectors to be multiplied and accumulated
+ \param taps One of the vectors to be multiplied and accumulated
+ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+ */
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+
+ /*lv_8sc_t* cPtr = result;
+ const lv_8sc_t* aPtr = input;
+ const lv_8sc_t* bPtr = taps;
+
+ for(int number = 0; number < num_points; number++){
+ *cPtr += (*aPtr++) * (*bPtr++);
+ }*/
+
+ char * res = (char*) result; // all three buffers are accessed as raw bytes: real, imag, real, imag, ...
+ char * in = (char*) input;
+ char * tp = (char*) taps;
+ unsigned int n_2_ccomplex_blocks = num_points/2; // the main loop handles two complex values per pass
+ unsigned int isodd = num_points & 1; // 1 when one unpaired value is left over
+
+ char sum0[2] = {0,0}; // {real, imag} accumulator for the first value of each pair
+ char sum1[2] = {0,0}; // {real, imag} accumulator for the second value of each pair
+ unsigned int i = 0;
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; // real part: re*re - im*im (narrowed to char on every update)
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; // imaginary part: re*im + im*re
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0]; // overwrites *result: its previous content is not accumulated
+ res[1] = sum0[1] + sum1[1];
+
+ // Cleanup if we had an odd number of points (the loop body runs at most once)
+ for(i = 0; i < isodd; ++i) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (SSE2, unaligned loads)
+ \param result Pointer to the single complex value where the accumulated result will be stored
+ \param input One of the vectors to be multiplied and accumulated
+ \param taps One of the vectors to be multiplied and accumulated
+ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+ */
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+
+ lv_8sc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(char)); // start the accumulator at zero
+
+ const lv_8sc_t* a = input;
+ const lv_8sc_t* b = taps;
+
+ const unsigned int sse_iters = num_points/8; // 8 elements per 16-byte register; assumes sizeof(lv_8sc_t) == 2 - TODO confirm
+
+ if (sse_iters>0)
+ {
+ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: keeps the low byte
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
+
+ for(int number = 0; number < sse_iters; number++){
+
+ x = _mm_loadu_si128((__m128i*)a); // SSE2 unaligned load; _mm_lddqu_si128 is SSE3 and must not be used under LV_HAVE_SSE2
+ y = _mm_loadu_si128((__m128i*)b);
+
+ imagx = _mm_srli_si128 (x, 1); // byte shift: the second byte of each pair moves into the low byte of the lane
+ imagx = _mm_and_si128 (imagx, mult1);
+ realx = _mm_and_si128 (x, mult1); // first byte of each pair, isolated in its 16-bit lane
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // the four partial products of (a+jb)(c+jd)
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: ac - bd
+ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: ad + bc
+
+ realcacc = _mm_add_epi16 (realcacc, realc); // per-lane running sums, reduced after the loop
+ imagcacc = _mm_add_epi16 (imagcacc, imagc);
+
+ a += 8;
+ b += 8;
+ }
+
+ realcacc = _mm_and_si128 (realcacc, mult1); // only the low byte of each accumulated lane is kept
+ imagcacc = _mm_and_si128 (imagcacc, mult1);
+ imagcacc = _mm_slli_si128 (imagcacc, 1); // move the imaginary byte back to the second byte of each pair
+
+ totalc = _mm_or_si128 (realcacc, imagcacc); // re-interleave real and imaginary bytes
+
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+
+ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<8; ++i) // horizontal reduction of the eight partial sums
+ {
+ dotProduct += dotProductVector[i];
+ }
+ }
+
+ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining num_points % 8 values
+ {
+ dotProduct += (*a++) * (*b++);
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (SSE4.1, unaligned loads)
+ \param result Pointer to the single complex value where the accumulated result will be stored
+ \param input One of the vectors to be multiplied and accumulated
+ \param taps One of the vectors to be multiplied and accumulated
+ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+ */
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+
+ lv_8sc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(char)); // start the accumulator at zero
+
+ const lv_8sc_t* a = input;
+ const lv_8sc_t* b = taps;
+
+ const unsigned int sse_iters = num_points/8; // 8 elements per 16-byte register; assumes sizeof(lv_8sc_t) == 2 - TODO confirm
+
+ if (sse_iters>0)
+ {
+ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: keeps the low byte
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
+
+ for(int number = 0; number < sse_iters; number++){
+
+ x = _mm_lddqu_si128((__m128i*)a); // unaligned 16-byte loads
+ y = _mm_lddqu_si128((__m128i*)b);
+
+ imagx = _mm_srli_si128 (x, 1); // byte shift: the second byte of each pair moves into the low byte of the lane
+ imagx = _mm_and_si128 (imagx, mult1);
+ realx = _mm_and_si128 (x, mult1); // first byte of each pair, isolated in its 16-bit lane
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // the four partial products of (a+jb)(c+jd)
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: ac - bd
+ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: ad + bc
+
+ realcacc = _mm_add_epi16 (realcacc, realc); // per-lane running sums, reduced after the loop
+ imagcacc = _mm_add_epi16 (imagcacc, imagc);
+
+ a += 8;
+ b += 8;
+ }
+
+ imagcacc = _mm_slli_si128 (imagcacc, 1); // move the imaginary low byte to the second byte of each pair
+
+ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); // per byte: take realcacc where mult1 is 0xFF, imagcacc elsewhere
+
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+
+ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<8; ++i) // horizontal reduction of the eight partial sums
+ {
+ dotProduct += dotProductVector[i];
+ }
+ }
+
+ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining num_points % 8 values
+ {
+ dotProduct += (*a++) * (*b++);
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/
+
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
+#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value
+ \param result Pointer to the single complex value where the accumulated result will be stored
+ \param input One of the vectors to be multiplied and accumulated
+ \param taps One of the vectors to be multiplied and accumulated
+ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+ */
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+
+ /*lv_8sc_t* cPtr = result;
+ const lv_8sc_t* aPtr = input;
+ const lv_8sc_t* bPtr = taps;
+
+ for(int number = 0; number < num_points; number++){
+ *cPtr += (*aPtr++) * (*bPtr++);
+ }*/
+
+ char * res = (char*) result; // all three buffers are accessed as raw bytes: real, imag, real, imag, ...
+ char * in = (char*) input;
+ char * tp = (char*) taps;
+ unsigned int n_2_ccomplex_blocks = num_points/2; // the main loop handles two complex values per pass
+ unsigned int isodd = num_points & 1; // 1 when one unpaired value is left over
+
+ char sum0[2] = {0,0}; // {real, imag} accumulator for the first value of each pair
+ char sum1[2] = {0,0}; // {real, imag} accumulator for the second value of each pair
+ unsigned int i = 0;
+
+ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; // real part: re*re - im*im (narrowed to char on every update)
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; // imaginary part: re*im + im*re
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0]; // overwrites *result: its previous content is not accumulated
+ res[1] = sum0[1] + sum1[1];
+
+ // Cleanup if we had an odd number of points (the loop body runs at most once)
+ for(i = 0; i < isodd; ++i) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Computes the dot product of two vectors of 8-bit complex samples with SSE2 (aligned version)
+ \param result Where the single accumulated complex value is stored; it is overwritten, not added to
+ \param input One of the vectors to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned
+ \param taps The other vector to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned
+ \param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
+ */
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+
+    lv_8sc_t dotProduct;
+    memset(&dotProduct, 0x0, sizeof(dotProduct));
+
+    const lv_8sc_t* a = input;
+    const lv_8sc_t* b = taps;
+
+    const unsigned int sse_iters = num_points / 8; // each 128-bit register holds 8 complex samples
+
+    if (sse_iters > 0)
+    {
+        __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+
+        mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane
+        realcacc = _mm_setzero_si128();
+        imagcacc = _mm_setzero_si128();
+
+        for(unsigned int number = 0; number < sse_iters; number++){
+            // Load 8 complex samples from each input
+            x = _mm_load_si128((const __m128i*)a);
+            y = _mm_load_si128((const __m128i*)b);
+            // Split into real and imaginary parts, one per 16-bit lane (high byte cleared)
+            imagx = _mm_srli_si128 (x, 1);
+            imagx = _mm_and_si128 (imagx, mult1);
+            realx = _mm_and_si128 (x, mult1);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+            // The four partial products of (a + jb) * (c + jd)
+            realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+            // real = ac - bd, imag = ad + bc
+            realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            realcacc = _mm_add_epi16 (realcacc, realc);
+            imagcacc = _mm_add_epi16 (imagcacc, imagc);
+
+            a += 8;
+            b += 8;
+        }
+        // Keep the low byte of every accumulator lane and interleave the real/imag bytes again
+        realcacc = _mm_and_si128 (realcacc, mult1);
+        imagcacc = _mm_and_si128 (imagcacc, mult1);
+        imagcacc = _mm_slli_si128 (imagcacc, 1);
+
+        totalc = _mm_or_si128 (realcacc, imagcacc);
+
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+
+        _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<8; ++i)
+        {
+            dotProduct += dotProductVector[i];
+        }
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        dotProduct += (*a++) * (*b++);
+    }
+
+    *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Computes the dot product of two vectors of 8-bit complex samples with SSE4.1 (aligned version)
+ \param result Where the single accumulated complex value is stored; it is overwritten, not added to
+ \param input One of the vectors to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned
+ \param taps The other vector to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned
+ \param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
+ */
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+
+    lv_8sc_t dotProduct;
+    memset(&dotProduct, 0x0, sizeof(dotProduct));
+
+    const lv_8sc_t* a = input;
+    const lv_8sc_t* b = taps;
+
+    const unsigned int sse_iters = num_points / 8; // each 128-bit register holds 8 complex samples
+
+    if (sse_iters > 0)
+    {
+        __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+
+        mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane
+        realcacc = _mm_setzero_si128();
+        imagcacc = _mm_setzero_si128();
+
+        for(unsigned int number = 0; number < sse_iters; number++){
+            // Load 8 complex samples from each input
+            x = _mm_load_si128((const __m128i*)a);
+            y = _mm_load_si128((const __m128i*)b);
+            // Split into real and imaginary parts, one per 16-bit lane (high byte cleared)
+            imagx = _mm_srli_si128 (x, 1);
+            imagx = _mm_and_si128 (imagx, mult1);
+            realx = _mm_and_si128 (x, mult1);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+            // The four partial products of (a + jb) * (c + jd)
+            realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+            // real = ac - bd, imag = ad + bc
+            realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            realcacc = _mm_add_epi16 (realcacc, realc);
+            imagcacc = _mm_add_epi16 (imagcacc, imagc);
+
+            a += 8;
+            b += 8;
+        }
+        // Move the imaginary low bytes to the odd byte positions, then blend: real on even bytes, imag on odd bytes
+        imagcacc = _mm_slli_si128 (imagcacc, 1);
+
+        totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
+
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+
+        _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<8; ++i)
+        {
+            dotProduct += dotProductVector[i];
+        }
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        dotProduct += (*a++) * (*b++);
+    }
+
+    *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief ORC entry point for the 8-bit complex dot product: calls the external ORC routine with two 16-bit accumulators and packs one byte of each into the result
+ \param result Where the single complex value is stored; it is overwritten
+ \param input One of the vectors to be multiplied and accumulated
+ \param taps The other vector to be multiplied and accumulated
+ \param num_points The number of complex values in input and taps, passed unchanged to the ORC routine
+ */
+extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){
+
+ short resReal = 0; // 16-bit slot handed to the ORC routine for the real part
+ char* resRealChar = (char*)&resReal;
+ resRealChar++; // second byte of resReal; NOTE(review): which half that is depends on byte order - confirm against the ORC routine
+
+ short resImag = 0; // 16-bit slot handed to the ORC routine for the imaginary part
+ char* resImagChar = (char*)&resImag;
+ resImagChar++; // second byte of resImag, same caveat as above
+
+ volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points);
+ // Only the selected byte of each 16-bit slot ends up in the result
+ *result = lv_cmake(*resRealChar, *resImagChar);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,346 @@
+/*!
+ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h
+ * \brief Volk protokernel: multiplies two vectors of 16-bit complex samples
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that multiplies two vectors of 16-bit complex samples
+ * (8 bits for the real part and 8 bits for the imaginary part)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
+#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Multiplies two vectors of 8-bit complex samples element by element with SSE2 (unaligned version)
+ \param cVector The vector where the results will be stored; each component keeps only the low 8 bits of the product
+ \param aVector One of the vectors to be multiplied; no alignment is required
+ \param bVector One of the vectors to be multiplied; no alignment is required
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 8; // each 128-bit register holds 8 complex samples
+
+    __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would need SSE3 (pmmintrin.h)
+        x = _mm_loadu_si128((const __m128i*)a);
+        y = _mm_loadu_si128((const __m128i*)b);
+        // Split into real and imaginary parts, one per 16-bit lane (high byte cleared)
+        imagx = _mm_srli_si128 (x, 1);
+        imagx = _mm_and_si128 (imagx, mult1);
+        realx = _mm_and_si128 (x, mult1);
+
+        imagy = _mm_srli_si128 (y, 1);
+        imagy = _mm_and_si128 (imagy, mult1);
+        realy = _mm_and_si128 (y, mult1);
+        // The four partial products of (a + jb) * (c + jd)
+        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+        // real = ac - bd, imag = ad + bc; keep the low byte of each and move imag to the odd bytes
+        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+        realc = _mm_and_si128 (realc, mult1);
+        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+        imagc = _mm_and_si128 (imagc, mult1);
+        imagc = _mm_slli_si128 (imagc, 1);
+
+        totalc = _mm_or_si128 (realc, imagc);
+
+        _mm_storeu_si128((__m128i*)c, totalc);
+
+        a += 8;
+        b += 8;
+        c += 8;
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = (*a++) * (*b++);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Multiplies two vectors of 8-bit complex samples element by element with SSE4.1 (unaligned version)
+ \param cVector The vector where the results will be stored; each component keeps only the low 8 bits of the product
+ \param aVector One of the vectors to be multiplied; no alignment is required
+ \param bVector One of the vectors to be multiplied; no alignment is required
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 8; // each 128-bit register holds 8 complex samples
+
+    __m128i x, y;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+
+    // Mask with 0xFF in the low byte of every 16-bit lane; also used as the blend selector below
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        // Load 8 complex samples from each input
+        x = _mm_lddqu_si128((const __m128i*)a);
+        y = _mm_lddqu_si128((const __m128i*)b);
+        // Split into real and imaginary parts, one per 16-bit lane (high byte cleared)
+        imagx = _mm_srli_si128 (x, 1);
+        imagx = _mm_and_si128 (imagx, mult1);
+        realx = _mm_and_si128 (x, mult1);
+
+        imagy = _mm_srli_si128 (y, 1);
+        imagy = _mm_and_si128 (imagy, mult1);
+        realy = _mm_and_si128 (y, mult1);
+        // The four partial products of (a + jb) * (c + jd)
+        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+        // real = ac - bd, imag = ad + bc; imag is moved to the odd bytes before blending
+        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+        imagc = _mm_slli_si128 (imagc, 1);
+
+        totalc = _mm_blendv_epi8 (imagc, realc, mult1);
+
+        _mm_storeu_si128((__m128i*)c, totalc);
+
+        a += 8;
+        b += 8;
+        c += 8;
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = (*a++) * (*b++);
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies two vectors of 8-bit complex samples element by element (generic implementation)
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+    lv_8sc_t* cPtr = cVector;
+    const lv_8sc_t* aPtr = aVector;
+    const lv_8sc_t* bPtr = bVector;
+    // The counter has the same unsigned type as num_points, so the comparison never mixes signedness
+    for(unsigned int number = 0; number < num_points; number++){
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
+#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Multiplies two vectors of 8-bit complex samples element by element with SSE2 (aligned version)
+ \param cVector The vector where the results will be stored; written with _mm_store_si128, so it must be 16-byte aligned
+ \param aVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+ \param bVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 8; // each 128-bit register holds 8 complex samples
+
+    __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        // Load 8 complex samples from each input
+        x = _mm_load_si128((const __m128i*)a);
+        y = _mm_load_si128((const __m128i*)b);
+        // Split into real and imaginary parts, one per 16-bit lane (high byte cleared)
+        imagx = _mm_srli_si128 (x, 1);
+        imagx = _mm_and_si128 (imagx, mult1);
+        realx = _mm_and_si128 (x, mult1);
+
+        imagy = _mm_srli_si128 (y, 1);
+        imagy = _mm_and_si128 (imagy, mult1);
+        realy = _mm_and_si128 (y, mult1);
+        // The four partial products of (a + jb) * (c + jd)
+        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+        // real = ac - bd, imag = ad + bc; keep the low byte of each and move imag to the odd bytes
+        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+        realc = _mm_and_si128 (realc, mult1);
+        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+        imagc = _mm_and_si128 (imagc, mult1);
+        imagc = _mm_slli_si128 (imagc, 1);
+
+        totalc = _mm_or_si128 (realc, imagc);
+
+        _mm_store_si128((__m128i*)c, totalc);
+
+        a += 8;
+        b += 8;
+        c += 8;
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = (*a++) * (*b++);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Multiplies two vectors of 8-bit complex samples element by element with SSE4.1 (aligned version)
+ \param cVector The vector where the results will be stored; written with _mm_store_si128, so it must be 16-byte aligned
+ \param aVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+ \param bVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 8; // each 128-bit register holds 8 complex samples
+
+    __m128i x, y;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    const lv_8sc_t* b = bVector;
+
+    // Mask with 0xFF in the low byte of every 16-bit lane; also used as the blend selector below
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+        // Load 8 complex samples from each input
+        x = _mm_load_si128((const __m128i*)a);
+        y = _mm_load_si128((const __m128i*)b);
+        // Split into real and imaginary parts, one per 16-bit lane (high byte cleared)
+        imagx = _mm_srli_si128 (x, 1);
+        imagx = _mm_and_si128 (imagx, mult1);
+        realx = _mm_and_si128 (x, mult1);
+
+        imagy = _mm_srli_si128 (y, 1);
+        imagy = _mm_and_si128 (imagy, mult1);
+        realy = _mm_and_si128 (y, mult1);
+        // The four partial products of (a + jb) * (c + jd)
+        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+        // real = ac - bd, imag = ad + bc; imag is moved to the odd bytes before blending
+        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+        imagc = _mm_slli_si128 (imagc, 1);
+
+        totalc = _mm_blendv_epi8 (imagc, realc, mult1);
+
+        _mm_store_si128((__m128i*)c, totalc);
+
+        a += 8;
+        b += 8;
+        c += 8;
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = (*a++) * (*b++);
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies two vectors of 8-bit complex samples element by element (generic implementation, aligned interface)
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+    lv_8sc_t* cPtr = cVector;
+    const lv_8sc_t* aPtr = aVector;
+    const lv_8sc_t* bPtr = bVector;
+    // The counter has the same unsigned type as num_points, so the comparison never mixes signedness
+    for(unsigned int number = 0; number < num_points; number++){
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief ORC entry point: forwards all arguments unchanged to the externally defined volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+ volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); // the multiplication itself lives in the ORC routine, which is not defined in this header
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,613 @@
+/*!
+ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part), and accumulates the result
+ * in 32-bit single-precision floating-point values, returning float32 values:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with SSE4.1 (unaligned version)
+ \param input The input signal
+ \param carrier The carrier signal the input is multiplied by (carrier wipe-off)
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output (a single value; it is reset to 0 and then accumulated into)
+ \param P_out Prompt correlation output (a single value; it is reset to 0 and then accumulated into)
+ \param L_out Late correlation output (a single value; it is reset to 0 and then accumulated into)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // 8 complex samples are consumed per vector iteration
+
+ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+ __m128 E_code_acc, P_code_acc, L_code_acc;
+ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+ __m128 output_ps;
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ // The outputs are reset here, so previous contents never leak into the result
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane
+
+ E_code_acc = _mm_setzero_ps();
+ L_code_acc = _mm_setzero_ps();
+ P_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x = _mm_lddqu_si128((__m128i*)input_ptr);
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+ // The CM_* macros come from the CommonMacros headers; NOTE(review): their exact semantics are not visible here
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+ //Get early values
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+ //Get prompt values
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+ //Get late values
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+ input_ptr += 8;
+ carrier_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ }
+ // Each 4-float accumulator is stored as two complex values, which are then summed into the output
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+
+ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<2; ++i)
+ {
+ *E_out_ptr += E_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ }
+ }
+ // Samples that did not fill a whole 128-bit register are handled one by one
+ lv_8sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get early, late, and prompt values for each
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with SSE2 (unaligned version)
+ \param input The input signal
+ \param carrier The carrier signal the input is multiplied by (carrier wipe-off)
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output (a single value; it is reset to 0 and then accumulated into)
+ \param P_out Prompt correlation output (a single value; it is reset to 0 and then accumulated into)
+ \param L_out Late correlation output (a single value; it is reset to 0 and then accumulated into)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // 8 complex samples are consumed per vector iteration
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 E_code_acc, P_code_acc, L_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 output_ps_1, output_ps_2;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    // The outputs are reset here, so previous contents never leak into the result
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane
+
+    E_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++){
+            // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would need SSE3 (pmmintrin.h)
+            //Perform the carrier wipe-off
+            x = _mm_loadu_si128((const __m128i*)input_ptr);
+            y = _mm_loadu_si128((const __m128i*)carrier_ptr);
+            // The CM_* macros come from the CommonMacros headers; NOTE(review): their exact semantics are not visible here
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get early values
+            y = _mm_loadu_si128((const __m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+            //Get prompt values
+            y = _mm_loadu_si128((const __m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+            //Get late values
+            y = _mm_loadu_si128((const __m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+        }
+        // Each 4-float accumulator is stored as two complex values, which are then summed into the output
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+
+        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+        }
+    }
+    // Samples that did not fill a whole 128-bit register are handled one by one
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points%8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get early, late, and prompt values for each
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Generic (non-SIMD) carrier wipe-off mixing followed by the Early, Prompt, and Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    lv_8sc_t bb_signal_sample;
+
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    // Accumulate the Early, Prompt and Late correlations; the index is unsigned to match num_points
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        // Carrier wipe-off: mix the input sample with the carrier sample
+        bb_signal_sample = input[i] * carrier[i];
+        // Multiply the mixed sample by each code replica; the cast to lv_32fc_t is applied to that product
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Carrier wipe-off mixing and Early, Prompt, and Late correlation (SSE4.1 kernel; inputs are read with aligned 128-bit loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 E_code_acc, P_code_acc, L_code_acc;
+    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+    __m128 output_ps;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    // 0xFF in the low byte of every 16-bit lane; handed to the CM_* helper macros below
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    E_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++){
+
+            // Carrier wipe-off on 8 complex samples; _mm_load_si128 requires 16-byte aligned pointers
+            x = _mm_load_si128((__m128i*)input_ptr);
+            y = _mm_load_si128((__m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            // Early: correlate the base-band samples with the Early replica
+            y = _mm_load_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+            // Prompt: correlate the base-band samples with the Prompt replica
+            y = _mm_load_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+            // Late: correlate the base-band samples with the Late replica
+            y = _mm_load_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        // Each accumulator holds four floats, stored here as two lv_32fc_t partial sums
+        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points%8; ++i)
+    {
+        // Scalar tail: carrier wipe-off for the samples that do not fill a whole 8-sample vector
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get early, late, and prompt values for each
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Carrier wipe-off mixing and Early, Prompt, and Late correlation (SSE2 kernel; inputs are read with aligned 128-bit loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 E_code_acc, P_code_acc, L_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 output_ps_1, output_ps_2;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    // 0xFF in the low byte of every 16-bit lane; handed to the CM_* helper macros below
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    E_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++){
+
+            // Carrier wipe-off on 8 complex samples; _mm_load_si128 requires 16-byte aligned pointers
+            x = _mm_load_si128((__m128i*)input_ptr);
+            y = _mm_load_si128((__m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            // Early: correlate the base-band samples with the Early replica
+            y = _mm_load_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+            // Prompt: correlate the base-band samples with the Prompt replica
+            y = _mm_load_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+            // Late: correlate the base-band samples with the Late replica
+            y = _mm_load_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        // Each accumulator holds four floats, stored here as two lv_32fc_t partial sums
+        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points%8; ++i)
+    {
+        // Scalar tail: carrier wipe-off for the samples that do not fill a whole 8-sample vector
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get early, late, and prompt values for each
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Generic (non-SIMD) carrier wipe-off mixing followed by the Early, Prompt, and Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    lv_8sc_t bb_signal_sample;
+
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    // Accumulate the Early, Prompt and Late correlations; the index is unsigned to match num_points
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        // Carrier wipe-off: mix the input sample with the carrier sample
+        bb_signal_sample = input[i] * carrier[i];
+        // Multiply the mixed sample by each code replica; the cast to lv_32fc_t is applied to that product
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,874 @@
+/*!
+ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part):
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+ /*!
+ \brief Carrier wipe-off mixing and Early, Prompt, and Late correlation with 8-bit complex outputs (SSE4.1 kernel, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_8sc_t* E_out_ptr = E_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_8sc_t* L_out_ptr = L_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_8sc_t* P_out_ptr = P_out;
+
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    // Byte mask with 0xFF in the low byte of every 16-bit lane (selects the real parts)
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    real_E_code_acc = _mm_setzero_si128();
+    imag_E_code_acc = _mm_setzero_si128();
+    real_L_code_acc = _mm_setzero_si128();
+    imag_L_code_acc = _mm_setzero_si128();
+    real_P_code_acc = _mm_setzero_si128();
+    imag_P_code_acc = _mm_setzero_si128();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++){
+
+            // Carrier wipe-off on 8 complex samples (unaligned loads)
+            x = _mm_lddqu_si128((__m128i*)input_ptr);
+            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+            // Split into 16-bit lanes: imaginary bytes are shifted down one byte, then both parts are masked
+            imagx = _mm_srli_si128 (x, 1);
+            imagx = _mm_and_si128 (imagx, mult1);
+            realx = _mm_and_si128 (x, mult1);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+            real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            // Early: complex multiply of the base-band samples by the Early replica, then accumulate
+            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
+            imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+
+            // Late: complex multiply of the base-band samples by the Late replica, then accumulate
+            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+            imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+
+            // Prompt: complex multiply of the base-band samples by the Prompt replica, then accumulate
+            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+            imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            E_code_ptr += 8;
+            L_code_ptr += 8;
+            P_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+        // Re-interleave: low byte of each real lane stays in place, low byte of each imaginary lane moves up one byte
+        imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
+        output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1);
+        _mm_storeu_si128((__m128i*)E_dotProductVector, output);
+
+        imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+        output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1);
+        _mm_storeu_si128((__m128i*)L_dotProductVector, output);
+
+        imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+        output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
+        _mm_storeu_si128((__m128i*)P_dotProductVector, output);
+
+        for (int i = 0; i<8; ++i)
+        {
+            *E_out_ptr += E_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points%8; ++i)
+    {
+        // Scalar tail: carrier wipe-off for the samples that do not fill a whole 8-sample vector
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get early, late, and prompt values for each
+        *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+        *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+        *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+    }
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Carrier wipe-off mixing and Early, Prompt, and Late correlation with 8-bit complex outputs (SSE2 kernel, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_8sc_t* E_out_ptr = E_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_8sc_t* L_out_ptr = L_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_8sc_t* P_out_ptr = P_out;
+
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    // Byte mask with 0xFF in the low byte of every 16-bit lane (selects the real parts)
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    real_E_code_acc = _mm_setzero_si128();
+    imag_E_code_acc = _mm_setzero_si128();
+    real_L_code_acc = _mm_setzero_si128();
+    imag_L_code_acc = _mm_setzero_si128();
+    real_P_code_acc = _mm_setzero_si128();
+    imag_P_code_acc = _mm_setzero_si128();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++){
+
+            // Carrier wipe-off on 8 complex samples; _mm_loadu_si128 is the SSE2 unaligned load (_mm_lddqu_si128 needs SSE3)
+            x = _mm_loadu_si128((__m128i*)input_ptr);
+            y = _mm_loadu_si128((__m128i*)carrier_ptr);
+            // Split into 16-bit lanes: imaginary bytes are shifted down one byte, then both parts are masked
+            imagx = _mm_srli_si128 (x, 1);
+            imagx = _mm_and_si128 (imagx, mult1);
+            realx = _mm_and_si128 (x, mult1);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+            real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            // Early: complex multiply of the base-band samples by the Early replica, then accumulate
+            y = _mm_loadu_si128((__m128i*)E_code_ptr);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
+            imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+
+            // Late: complex multiply of the base-band samples by the Late replica, then accumulate
+            y = _mm_loadu_si128((__m128i*)L_code_ptr);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+            imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+
+            // Prompt: complex multiply of the base-band samples by the Prompt replica, then accumulate
+            y = _mm_loadu_si128((__m128i*)P_code_ptr);
+
+            imagy = _mm_srli_si128 (y, 1);
+            imagy = _mm_and_si128 (imagy, mult1);
+            realy = _mm_and_si128 (y, mult1);
+
+            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+            real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+            imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            E_code_ptr += 8;
+            L_code_ptr += 8;
+            P_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+        __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+        // Re-interleave: keep the low byte of every lane, move the imaginary bytes up one byte, then OR the two together
+        real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1);
+        imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1);
+        imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
+        output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc);
+        _mm_storeu_si128((__m128i*)E_dotProductVector, output);
+
+        real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
+        imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
+        imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+        output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
+        _mm_storeu_si128((__m128i*)L_dotProductVector, output);
+
+        real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
+        imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
+        imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+        output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
+        _mm_storeu_si128((__m128i*)P_dotProductVector, output);
+
+        for (int i = 0; i<8; ++i)
+        {
+            *E_out_ptr += E_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points%8; ++i)
+    {
+        // Scalar tail: carrier wipe-off for the samples that do not fill a whole 8-sample vector
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get early, late, and prompt values for each
+        *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+        *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+        *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+    }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Generic (non-SIMD) carrier wipe-off mixing followed by the Early, Prompt, and Late correlation with 8-bit complex outputs
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    lv_8sc_t bb_signal_sample;
+
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    // Accumulate the Early, Prompt and Late correlations; the index is unsigned to match num_points
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        // Carrier wipe-off: mix the input sample with the carrier sample
+        bb_signal_sample = input[i] * carrier[i];
+        // Multiply the mixed sample by each code replica and add it to the matching output
+        *E_out += bb_signal_sample * E_code[i];
+        *P_out += bb_signal_sample * P_code[i];
+        *L_out += bb_signal_sample * L_code[i];
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation, 8 complex samples per iteration. Products and sums are formed in 16-bit lanes; only the low byte of each lane reaches the 8-bit outputs.
+ \param input The input signal input (read with aligned 128-bit loads)
+ \param carrier The carrier signal input (read with aligned 128-bit loads)
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // one 128-bit register is consumed per 8 lv_8sc_t samples
+
+ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_8sc_t* E_out_ptr = E_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_8sc_t* L_out_ptr = L_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_8sc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0; // outputs are cleared first; vector partial sums and tail samples are added afterwards
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // arguments run from highest to lowest byte: 0xFF in the low byte of every 16-bit lane
+
+ real_E_code_acc = _mm_setzero_si128();
+ imag_E_code_acc = _mm_setzero_si128();
+ real_L_code_acc = _mm_setzero_si128();
+ imag_L_code_acc = _mm_setzero_si128();
+ real_P_code_acc = _mm_setzero_si128();
+ imag_P_code_acc = _mm_setzero_si128();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off: bb_signal_sample = input * carrier, real and imaginary parts kept in separate registers
+ x = _mm_load_si128((__m128i*)input_ptr); // aligned load: address must be 16-byte aligned
+ y = _mm_load_si128((__m128i*)carrier_ptr);
+
+ imagx = _mm_srli_si128 (x, 1); // whole-register shift by one byte: the odd (imaginary) bytes drop into the low byte of each lane
+ imagx = _mm_and_si128 (imagx, mult1); // clear the high byte of each lane (zero-extension)
+ realx = _mm_and_si128 (x, mult1); // even (real) bytes, high byte cleared
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each lane-wise product
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // Re = re*re - im*im
+ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // Im = re*im + im*re
+
+ //Get early values: bb_signal_sample * E_code, same split-and-multiply pattern as above
+ y = _mm_load_si128((__m128i*)E_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); // per-lane running sums, 16-bit wrap-around
+ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+
+ //Get late values: bb_signal_sample * L_code
+ y = _mm_load_si128((__m128i*)L_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+
+ //Get prompt values: bb_signal_sample * P_code
+ y = _mm_load_si128((__m128i*)P_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+
+ input_ptr += 8; // advance every stream by the 8 samples just consumed
+ carrier_ptr += 8;
+ E_code_ptr += 8;
+ L_code_ptr += 8;
+ P_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; // scratch: 8 per-lane partial sums for each tap
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+
+ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the low byte of each imaginary lane into the odd byte position
+ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); // even bytes from the real sums, odd bytes from the shifted imaginary sums
+ _mm_store_si128((__m128i*)E_dotProductVector, output);
+
+ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1);
+ _mm_store_si128((__m128i*)L_dotProductVector, output);
+
+ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
+ _mm_store_si128((__m128i*)P_dotProductVector, output);
+
+ for (int i = 0; i<8; ++i) // horizontal reduction of the 8 partial sums into each output
+ {
+ *E_out_ptr += E_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ }
+ }
+
+ lv_8sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide iterations
+ {
+ //Perform the carrier wipe-off on one sample
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Accumulate the early, prompt and late products for this sample
+ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation, 8 complex samples per iteration. Products and sums are formed in 16-bit lanes; only the low byte of each lane reaches the 8-bit outputs.
+ \param input The input signal input (read with aligned 128-bit loads)
+ \param carrier The carrier signal input (read with aligned 128-bit loads)
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // one 128-bit register is consumed per 8 lv_8sc_t samples
+
+ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_8sc_t* E_out_ptr = E_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_8sc_t* L_out_ptr = L_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_8sc_t* P_out_ptr = P_out;
+
+ *E_out_ptr = 0; // outputs are cleared first; vector partial sums and tail samples are added afterwards
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // arguments run from highest to lowest byte: 0xFF in the low byte of every 16-bit lane
+
+ real_E_code_acc = _mm_setzero_si128();
+ imag_E_code_acc = _mm_setzero_si128();
+ real_L_code_acc = _mm_setzero_si128();
+ imag_L_code_acc = _mm_setzero_si128();
+ real_P_code_acc = _mm_setzero_si128();
+ imag_P_code_acc = _mm_setzero_si128();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off: bb_signal_sample = input * carrier, real and imaginary parts kept in separate registers
+ x = _mm_load_si128((__m128i*)input_ptr); // aligned load: address must be 16-byte aligned
+ y = _mm_load_si128((__m128i*)carrier_ptr);
+
+ imagx = _mm_srli_si128 (x, 1); // whole-register shift by one byte: the odd (imaginary) bytes drop into the low byte of each lane
+ imagx = _mm_and_si128 (imagx, mult1); // clear the high byte of each lane (zero-extension)
+ realx = _mm_and_si128 (x, mult1); // even (real) bytes, high byte cleared
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each lane-wise product
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // Re = re*re - im*im
+ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // Im = re*im + im*re
+
+ //Get early values: bb_signal_sample * E_code, same split-and-multiply pattern as above
+ y = _mm_load_si128((__m128i*)E_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); // per-lane running sums, 16-bit wrap-around
+ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+
+ //Get late values: bb_signal_sample * L_code
+ y = _mm_load_si128((__m128i*)L_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+
+ //Get prompt values: bb_signal_sample * P_code
+ y = _mm_load_si128((__m128i*)P_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+
+ input_ptr += 8; // advance every stream by the 8 samples just consumed
+ carrier_ptr += 8;
+ E_code_ptr += 8;
+ L_code_ptr += 8;
+ P_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; // scratch: 8 per-lane partial sums for each tap
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+
+ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); // keep only the low byte of each real sum
+ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); // keep only the low byte of each imaginary sum
+ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the imaginary bytes into the odd byte positions
+ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); // interleave: real bytes even, imaginary bytes odd
+ _mm_store_si128((__m128i*)E_dotProductVector, output);
+
+ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
+ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
+ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
+ _mm_store_si128((__m128i*)L_dotProductVector, output);
+
+ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
+ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
+ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
+ _mm_store_si128((__m128i*)P_dotProductVector, output);
+
+ for (int i = 0; i<8; ++i) // horizontal reduction of the 8 partial sums into each output
+ {
+ *E_out_ptr += E_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ }
+ }
+
+ lv_8sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide iterations
+ {
+ //Perform the carrier wipe-off on one sample
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Accumulate the early, prompt and late products for this sample
+ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with a plain scalar loop (no SIMD intrinsics)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+ lv_8sc_t bb_signal_sample; // base-band sample: input[i] * carrier[i]
+
+ bb_signal_sample = lv_cmake(0, 0);
+
+ *E_out = 0; // each output is a single lv_8sc_t accumulator, cleared before summing
+ *P_out = 0;
+ *L_out = 0;
+ // Early, Prompt and Late correlation: one scalar pass over all num_points samples
+ for(int i=0; i < num_points; ++i) // NOTE(review): i is int while num_points is unsigned int - signed/unsigned comparison
+ {
+ // Carrier wipe-off: mix the input sample with the carrier sample
+ bb_signal_sample = input[i] * carrier[i];
+ // Multiply the base-band sample by each code replica and accumulate into the matching output
+ *E_out += bb_signal_sample * E_code[i];
+ *P_out += bb_signal_sample * P_code[i];
+ *L_out += bb_signal_sample * L_code[i];
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation by delegating to two external ORC implementations (Early+Prompt, then Late)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+
+extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points);
+extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){
+ // NOTE(review): this wrapper is named *_u_orc but forwards to the *_a_orc_impl functions - confirm the intended alignment variant
+ short E_out_real = 0; // short accumulators, passed by address to the ORC implementation
+ short E_out_imag = 0;
+ char* E_out_real_c = (char*)&E_out_real;
+ E_out_real_c++; // now addresses the second byte of E_out_real; NOTE(review): which half that is depends on endianness - confirm
+ char* E_out_imag_c = (char*)&E_out_imag;
+ E_out_imag_c++;
+
+ short P_out_real = 0;
+ short P_out_imag = 0;
+ char* P_out_real_c = (char*)&P_out_real;
+ P_out_real_c++; // second byte of P_out_real
+ char* P_out_imag_c = (char*)&P_out_imag;
+ P_out_imag_c++;
+
+ short L_out_real = 0;
+ short L_out_imag = 0;
+ char* L_out_real_c = (char*)&L_out_real;
+ L_out_real_c++; // second byte of L_out_real
+ char* L_out_imag_c = (char*)&L_out_imag;
+ L_out_imag_c++;
+
+ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, &E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points);
+ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points);
+
+ //The ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is split into two functions because a single function
+ //of that length appeared to cause memory problems (bad access, segmentation fault).
+ //Also, the maximum number of accumulators that can be used is 4 (and we need 6).
+ //As a consequence the "carrier wipe-off" step is done twice: once in the first function and once in the second.
+ //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done
+ //only once.
+
+ *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c); // build each 8-bit complex output from the selected byte of each accumulator
+ *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c);
+ *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,797 @@
+/*!
+ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. In order to avoid overflow, if input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 (2 bits).
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part), and accumulates the result
+ * in 32-bit single-precision floating-point values, returning float32 values:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Very Early values are calculated by multiplying the input signal in BB by the
+ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Very Late values are calculated by multiplying the input signal in BB by the
+ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ *
+ * -------------------------------------------------------------------------
+ * Bits analysis
+ *
+ * input = 8 bits
+ * carrier = 8 bits
+ * XX_code = 8 bits
+ * XX_out = 8 bits
+ * bb_signal_sample = 8 bits
+ *
+ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+ *
+ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between -7 and 7 to avoid overflow (3 bits)
+ *
+ * conclusion = input and carrier must be values between -1 and 1 (1 bit) and XX_code must be values between -7 and 7 to avoid overflow (3 bits)
+ * If input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 to avoid overflow (2 bits).
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation, 8 complex samples per iteration, accumulating into float32 complex outputs
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // one 128-bit register is consumed per 8 lv_8sc_t samples
+
+ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; // NOTE(review): output is not referenced in this function body - possibly unused
+
+ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // float accumulators, one register per correlator tap
+ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; // scratch registers handed to the CM_* macros
+ __m128 output_ps;
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ *VE_out_ptr = 0; // outputs are cleared first; vector partial sums and tail samples are added afterwards
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ *VL_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // arguments run from highest to lowest byte: 0xFF in the low byte of every 16-bit lane
+
+ VE_code_acc = _mm_setzero_ps();
+ E_code_acc = _mm_setzero_ps();
+ P_code_acc = _mm_setzero_ps();
+ L_code_acc = _mm_setzero_ps();
+ VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 128-bit load
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) // macro body not visible here; presumably splits x into realx/imagx 16-bit lanes - confirm in CommonMacros
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) // presumably the complex product input*carrier, written to real/imag_bb_signal_sample - confirm
+
+ //Get very early values
+ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) // presumably leaves this tap's float contribution in output_ps - confirm
+
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+
+ //Get early values
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+ //Get prompt values
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+ //Get late values
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+ //Get very late values
+ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+
+ input_ptr += 8; // advance every stream by the 8 samples just consumed
+ carrier_ptr += 8;
+ VE_code_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ VL_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; // scratch: each __m128 accumulator is stored as 2 lv_32fc_t partial sums
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<2; ++i) // horizontal reduction of the 2 partial sums into each output
+ {
+ *VE_out_ptr += VE_dotProductVector[i];
+ *E_out_ptr += E_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ *VL_out_ptr += VL_dotProductVector[i];
+ }
+ }
+
+ lv_8sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide iterations
+ {
+ //Perform the carrier wipe-off on one sample
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Very early, early, prompt, late and very late products. NOTE(review): each product is formed from two lv_8sc_t operands before the cast to lv_32fc_t - confirm its range is sufficient
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 output_ps_1, output_ps_2;
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ *VE_out_ptr = 0;
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ *VL_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+ VE_code_acc = _mm_setzero_ps();
+ E_code_acc = _mm_setzero_ps();
+ P_code_acc = _mm_setzero_ps();
+ L_code_acc = _mm_setzero_ps();
+ VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x = _mm_lddqu_si128((__m128i*)input_ptr);
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+ //Get very early values
+ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+
+ //Get early values
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+ //Get prompt values
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+ //Get late values
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+ //Get very late values
+ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+
+ input_ptr += 8;
+ carrier_ptr += 8;
+ VE_code_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ VL_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<2; ++i)
+ {
+ *VE_out_ptr += VE_dotProductVector[i];
+ *E_out_ptr += E_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ *VL_out_ptr += VL_dotProductVector[i];
+ }
+ }
+
+ lv_8sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    unsigned int i; // unsigned like num_points: a signed index would overflow for num_points > INT_MAX
+
+    // Each output is a single accumulator: clear it before summing.
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    for(i = 0; i < num_points; ++i)
+    {
+        // Carrier wipe-off; the product is kept in an lv_8sc_t, as the correlations below expect
+        const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
+
+        // Very Early, Early, Prompt, Late and Very Late correlation of the wiped-off sample
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 version, aligned loads)
+ \param input The input signal input (read with _mm_load_si128, so it must be 16-byte aligned)
+ \param carrier The carrier signal input (16-byte aligned)
+ \param VE_code Very Early PRN code replica input (16-byte aligned)
+ \param E_code Early PRN code replica input (16-byte aligned)
+ \param P_code Prompt PRN code replica input (16-byte aligned)
+ \param L_code Late PRN code replica input (16-byte aligned)
+ \param VL_code Very Late PRN code replica input (16-byte aligned)
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // every vector iteration advances all inputs by 8 samples
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+    // NOTE(review): 'output' is not referenced by the code visible here; it may be used inside the CM_* macros - confirm before removing.
+    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+    __m128 output_ps;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+    // Each output is a single accumulator: clear it before anything is added.
+    *VE_out_ptr = 0;
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    *VL_out_ptr = 0;
+    // Byte mask 0x00FF in every 16-bit lane, handed to the CM_* macros (presumably to separate real and imaginary bytes - confirm in CommonMacros.h)
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    VE_code_acc = _mm_setzero_ps();
+    E_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            //Perform the carrier wipe-off
+            x = _mm_load_si128((const __m128i*)input_ptr);
+            y = _mm_load_si128((const __m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get very early values
+            y = _mm_load_si128((const __m128i*)VE_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+
+            //Get early values
+            y = _mm_load_si128((const __m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+            //Get prompt values
+            y = _mm_load_si128((const __m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+            //Get late values
+            y = _mm_load_si128((const __m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+            //Get very late values
+            y = _mm_load_si128((const __m128i*)VL_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+            // Step every input stream to the next group of 8 samples
+            input_ptr += 8;
+            carrier_ptr += 8;
+            VE_code_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+            VL_code_ptr += 8;
+        }
+        // Each __m128 accumulator is spilled as two lv_32fc_t partial sums, which are then added to the outputs
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *VE_out_ptr += VE_dotProductVector[i];
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *VL_out_ptr += VL_dotProductVector[i];
+        }
+    }
+    // Scalar tail: the num_points % 8 samples that do not fill a whole vector iteration
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points % 8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE2 version, aligned loads)
+ \param input The input signal input (read with _mm_load_si128, so it must be 16-byte aligned)
+ \param carrier The carrier signal input (16-byte aligned)
+ \param VE_code Very Early PRN code replica input (16-byte aligned)
+ \param E_code Early PRN code replica input (16-byte aligned)
+ \param P_code Prompt PRN code replica input (16-byte aligned)
+ \param L_code Late PRN code replica input (16-byte aligned)
+ \param VL_code Very Late PRN code replica input (16-byte aligned)
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // every vector iteration advances all inputs by 8 samples
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+    // NOTE(review): 'output' is not referenced by the code visible here; it may be used inside the CM_* macros - confirm before removing.
+    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 output_ps_1, output_ps_2;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+    // Each output is a single accumulator: clear it before anything is added.
+    *VE_out_ptr = 0;
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    *VL_out_ptr = 0;
+    // Byte mask 0x00FF in every 16-bit lane, handed to the CM_* macros (presumably to separate real and imaginary bytes - confirm in CommonMacros.h)
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    VE_code_acc = _mm_setzero_ps();
+    E_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            //Perform the carrier wipe-off
+            x = _mm_load_si128((const __m128i*)input_ptr);
+            y = _mm_load_si128((const __m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get very early values
+            y = _mm_load_si128((const __m128i*)VE_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+
+            //Get early values
+            y = _mm_load_si128((const __m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+            //Get prompt values
+            y = _mm_load_si128((const __m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+            //Get late values
+            y = _mm_load_si128((const __m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+            //Get very late values
+            y = _mm_load_si128((const __m128i*)VL_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+            // Step every input stream to the next group of 8 samples
+            input_ptr += 8;
+            carrier_ptr += 8;
+            VE_code_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+            VL_code_ptr += 8;
+        }
+        // Each __m128 accumulator is spilled as two lv_32fc_t partial sums, which are then added to the outputs
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *VE_out_ptr += VE_dotProductVector[i];
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *VL_out_ptr += VL_dotProductVector[i];
+        }
+    }
+    // Scalar tail: the num_points % 8 samples that do not fill a whole vector iteration
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points % 8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    unsigned int i; // unsigned like num_points: a signed index would overflow for num_points > INT_MAX
+
+    // Each output is a single accumulator: clear it before summing.
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    for(i = 0; i < num_points; ++i)
+    {
+        // Carrier wipe-off; the product is kept in an lv_8sc_t, as the correlations below expect
+        const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
+
+        // Very Early, Early, Prompt, Late and Very Late correlation of the wiped-off sample
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,1520 @@
+/*!
+ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part), and accumulates the result
+ * in 32-bit single-precision floating-point values, returning float32 values:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Very Early values are calculated by multiplying the input signal in BB by the
+ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Very Late values are calculated by multiplying the input signal in BB by the
+ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 output_ps_1, output_ps_2;
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ *VE_out_ptr = 0;
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ *VL_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+ VE_code_acc = _mm_setzero_ps();
+ E_code_acc = _mm_setzero_ps();
+ P_code_acc = _mm_setzero_ps();
+ L_code_acc = _mm_setzero_ps();
+ VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x = _mm_lddqu_si128((__m128i*)input_ptr);
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ imagx = _mm_srli_si128 (x, 1);
+ imagx = _mm_and_si128 (imagx, mult1);
+ realx = _mm_and_si128 (x, mult1);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+
+ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ //Get very early values
+ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+
+ //Get early values
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+ //Get prompt values
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+ //Get late values
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+ //Get very late values
+ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+ imagy = _mm_srli_si128 (y, 1);
+ imagy = _mm_and_si128 (imagy, mult1);
+ realy = _mm_and_si128 (y, mult1);
+
+ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+
+ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+
+ input_ptr += 8;
+ carrier_ptr += 8;
+ VE_code_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ VL_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<2; ++i)
+ {
+ *VE_out_ptr += VE_dotProductVector[i];
+ *E_out_ptr += E_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ *VL_out_ptr += VL_dotProductVector[i];
+ }
+ }
+
+ lv_8sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late, and Very Late correlation. Unaligned SSE4.1 variant: 8 complex samples are processed per iteration, only the low 8 bits of every intermediate product are kept, and the remaining num_points % 8 samples are handled by a scalar loop.
+ \param input The input signal
+ \param carrier The carrier signal that multiplies the input (carrier wipe-off)
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (one complex value, overwritten)
+ \param E_out Early correlation output (one complex value, overwritten)
+ \param P_out Prompt correlation output (one complex value, overwritten)
+ \param L_out Late correlation output (one complex value, overwritten)
+ \param VL_out Very Late correlation output (one complex value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // one 128-bit register holds 8 complex int8 samples
+
+ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+ __m128i mult1, output, real_output, imag_output;
+
+ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // each holds two complex partial sums as [re, im, re, im]
+ __m128i input_i_1, input_i_2, output_i32;
+ __m128 output_ps_1, output_ps_2;
+
+ __m128i check_sign_sequence = _mm_set_epi8 (-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); // -1 on the odd bytes, used to negate them with _mm_sign_epi8
+ __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // shuffle mask swapping each pair of adjacent bytes, built once instead of per use
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ *VE_out_ptr = 0; // outputs are reset here and only accumulated into afterwards
+ *E_out_ptr = 0;
+ *P_out_ptr = 0;
+ *L_out_ptr = 0;
+ *VL_out_ptr = 0;
+
+ mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); // 0xFF on the even bytes: blend mask selecting the low byte of each 16-bit lane
+
+ VE_code_acc = _mm_setzero_ps();
+ E_code_acc = _mm_setzero_ps();
+ P_code_acc = _mm_setzero_ps();
+ L_code_acc = _mm_setzero_ps();
+ VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(unsigned int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off: complex product of 8 input samples and 8 carrier samples
+ x = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 16-byte load
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ x_abs = _mm_abs_epi8 (x); // _mm_maddubs_epi16 takes its first operand as unsigned bytes
+
+ y_aux = _mm_sign_epi8 (y, x); // carry the sign of x over to y. NOTE(review): a y byte of -128 cannot be negated in 8 bits - confirm inputs avoid it
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); // negate the odd (imaginary) bytes
+ real_output = _mm_maddubs_epi16 (x_abs, y_aux); // per 16-bit lane: re(x)*re(y) - im(x)*im(y)
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); // swap re/im bytes of y
+ y_aux = _mm_sign_epi8 (y_aux, x);
+ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // per 16-bit lane: re(x)*im(y) + im(x)*re(y)
+
+ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of every imaginary lane to the odd byte position
+ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // re-interleave as 8-bit complex, keeping only the low byte of each product
+
+ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+ //Get very early values: same complex product, baseband sample times code replica
+ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1); // 8 complex products truncated to 8 bits per component
+
+ input_i_1 = _mm_cvtepi8_epi32(output); // widen bytes 0-3 (complex products 0 and 1) to int32
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output); // complex products 2 and 3
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32); // [re, im, re, im] partial sums as floats
+
+ input_i_1 = _mm_cvtepi8_epi32(output); // complex products 4 and 5
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output); // complex products 6 and 7
+ output = _mm_srli_si128 (output, 4); // NOTE(review): this shifted value is overwritten before it is read again
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+
+ //Get early values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+ //Get prompt values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+ //Get late values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+ //Get very late values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+
+ input_i_1 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ input_i_2 = _mm_cvtepi8_epi32(output);
+ output = _mm_srli_si128 (output, 4);
+ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+
+ input_ptr += 8; // advance every stream by the 8 complex samples just consumed
+ carrier_ptr += 8;
+ VE_code_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ VL_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<2; ++i) // fold the two complex partial sums of each accumulator into its output
+ {
+ *VE_out_ptr += VE_dotProductVector[i];
+ *E_out_ptr += E_dotProductVector[i];
+ *P_out_ptr += P_dotProductVector[i];
+ *L_out_ptr += L_dotProductVector[i];
+ *VL_out_ptr += VL_dotProductVector[i];
+ }
+ }
+
+ lv_8sc_t bb_signal_sample;
+ for(unsigned int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole register
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late, and Very Late correlation. Unaligned SSE4.1 variant: 8 complex samples are processed per iteration, the wiped-off sample is kept as 8-bit complex, the correlation products are accumulated from their 16-bit lanes with separate real and imaginary accumulators, and the remaining num_points % 8 samples are handled by a scalar loop.
+ \param input The input signal
+ \param carrier The carrier signal that multiplies the input (carrier wipe-off)
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (one complex value, overwritten)
+ \param E_out Early correlation output (one complex value, overwritten)
+ \param P_out Prompt correlation output (one complex value, overwritten)
+ \param L_out Late correlation output (one complex value, overwritten)
+ \param VL_out Very Late correlation output (one complex value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8; // one 128-bit register holds 8 complex int8 samples
+
+ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+ __m128i mult1, real_output, imag_output;
+
+ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32;
+ __m128 real_output_ps, imag_output_ps;
+
+ __m128i check_sign_sequence = _mm_set_epi8 (-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); // -1 on the odd bytes, used to negate them with _mm_sign_epi8
+ __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // shuffle mask swapping each pair of adjacent bytes
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ float VE_out_real = 0; // scalar totals of the SIMD part; they stay 0 when there are fewer than 8 samples
+ float VE_out_imag = 0;
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+ float VL_out_real = 0;
+ float VL_out_imag = 0;
+
+ mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); // 0xFF on the even bytes: blend mask selecting the low byte of each 16-bit lane
+
+ real_VE_code_acc = _mm_setzero_ps();
+ imag_VE_code_acc = _mm_setzero_ps();
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+ real_VL_code_acc = _mm_setzero_ps();
+ imag_VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(unsigned int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off: complex product of 8 input samples and 8 carrier samples
+ x = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 16-byte load
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ x_abs = _mm_abs_epi8 (x); // _mm_maddubs_epi16 takes its first operand as unsigned bytes
+
+ y_aux = _mm_sign_epi8 (y, x); // carry the sign of x over to y. NOTE(review): a y byte of -128 cannot be negated in 8 bits - confirm inputs avoid it
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); // negate the odd (imaginary) bytes
+ real_output = _mm_maddubs_epi16 (x_abs, y_aux); // per 16-bit lane: re(x)*re(y) - im(x)*im(y)
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); // swap re/im bytes of y
+ y_aux = _mm_sign_epi8 (y_aux, x);
+ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // per 16-bit lane: re(x)*im(y) + im(x)*re(y)
+
+ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of every imaginary lane to the odd byte position
+ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // re-interleave as 8-bit complex, keeping only the low byte of each product
+ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+ //Get very early values: the products stay in 16-bit lanes and are widened to int32 before summing
+ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); // 8 real parts, 16 bits each
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); // 8 imaginary parts, 16 bits each
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output); // widen lanes 0-3 to int32
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output); // widen lanes 4-7 to int32
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+ //Get early values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ //Get very late values (same steps as the very early branch)
+ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+ input_ptr += 8; // advance every stream by the 8 complex samples just consumed
+ carrier_ptr += 8;
+ VE_code_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ VL_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
+ {
+ VE_out_real += real_VE_dotProductVector[i];
+ VE_out_imag += imag_VE_dotProductVector[i];
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ VL_out_real += real_VL_dotProductVector[i];
+ VL_out_imag += imag_VL_dotProductVector[i];
+ }
+ } // end of the SIMD part
+ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); // written unconditionally so the outputs start from a defined value (0 when num_points < 8) before the scalar tail accumulates into them
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+
+ lv_16sc_t bb_signal_sample;
+ for(unsigned int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole register
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); // NOTE(review): the product is formed from two 8-bit complex operands before being stored as 16-bit complex - confirm it is widened as intended
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late, and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 8;
+
+ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
+ __m128i real_output, imag_output;
+ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32;
+ __m128 real_output_ps, imag_output_ps;
+ __m128i minus128control;
+
+ __m128i minus128 = _mm_set1_epi8 (-128);
+ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+ const lv_8sc_t* input_ptr = input;
+ const lv_8sc_t* carrier_ptr = carrier;
+
+ const lv_8sc_t* VE_code_ptr = VE_code;
+ lv_32fc_t* VE_out_ptr = VE_out;
+ const lv_8sc_t* E_code_ptr = E_code;
+ lv_32fc_t* E_out_ptr = E_out;
+ const lv_8sc_t* P_code_ptr = P_code;
+ lv_32fc_t* P_out_ptr = P_out;
+ const lv_8sc_t* L_code_ptr = L_code;
+ lv_32fc_t* L_out_ptr = L_out;
+ const lv_8sc_t* VL_code_ptr = VL_code;
+ lv_32fc_t* VL_out_ptr = VL_out;
+
+ float VE_out_real = 0;
+ float VE_out_imag = 0;
+ float E_out_real = 0;
+ float E_out_imag = 0;
+ float P_out_real = 0;
+ float P_out_imag = 0;
+ float L_out_real = 0;
+ float L_out_imag = 0;
+ float VL_out_real = 0;
+ float VL_out_imag = 0;
+
+ real_VE_code_acc = _mm_setzero_ps();
+ imag_VE_code_acc = _mm_setzero_ps();
+ real_E_code_acc = _mm_setzero_ps();
+ imag_E_code_acc = _mm_setzero_ps();
+ real_P_code_acc = _mm_setzero_ps();
+ imag_P_code_acc = _mm_setzero_ps();
+ real_L_code_acc = _mm_setzero_ps();
+ imag_L_code_acc = _mm_setzero_ps();
+ real_VL_code_acc = _mm_setzero_ps();
+ imag_VL_code_acc = _mm_setzero_ps();
+
+ if (sse_iters>0)
+ {
+ for(int number = 0;number < sse_iters; number++){
+
+ //Perform the carrier wipe-off
+ x = _mm_lddqu_si128((__m128i*)input_ptr);
+ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+ x_abs = _mm_abs_epi8 (x);
+
+ y_aux = _mm_sign_epi8 (y, x);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (x_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, x);
+ imag_output = _mm_maddubs_epi16 (x_abs, y_aux);
+
+ imag_output = _mm_slli_si128 (imag_output, 1);
+ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+ //Get very early values
+ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+ minus128control = _mm_cmpeq_epi8 (y, minus128);
+ y = _mm_sub_epi8 (y, minus128control);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+ //Get early values
+ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+ minus128control = _mm_cmpeq_epi8 (y, minus128);
+ y = _mm_sub_epi8 (y, minus128control);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+ //Get prompt values
+ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+ minus128control = _mm_cmpeq_epi8 (y, minus128);
+ y = _mm_sub_epi8 (y, minus128control);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+ //Get late values
+ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+ minus128control = _mm_cmpeq_epi8 (y, minus128);
+ y = _mm_sub_epi8 (y, minus128control);
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+ //Get very late values
+ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+ minus128control = _mm_cmpeq_epi8 (y, minus128);
+ y = _mm_sub_epi8 (y, minus128control);
+
+
+ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
+ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+
+ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+ real_output = _mm_srli_si128 (real_output, 8);
+ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+
+ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+ imag_output = _mm_srli_si128 (imag_output, 8);
+ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+
+ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+ input_ptr += 8;
+ carrier_ptr += 8;
+ VE_code_ptr += 8;
+ E_code_ptr += 8;
+ P_code_ptr += 8;
+ L_code_ptr += 8;
+ VL_code_ptr += 8;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+ for (int i = 0; i<4; ++i)
+ {
+ VE_out_real += real_VE_dotProductVector[i];
+ VE_out_imag += imag_VE_dotProductVector[i];
+ E_out_real += real_E_dotProductVector[i];
+ E_out_imag += imag_E_dotProductVector[i];
+ P_out_real += real_P_dotProductVector[i];
+ P_out_imag += imag_P_dotProductVector[i];
+ L_out_real += real_L_dotProductVector[i];
+ L_out_imag += imag_L_dotProductVector[i];
+ VL_out_real += real_VL_dotProductVector[i];
+ VL_out_imag += imag_VL_dotProductVector[i];
+ }
+ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+ }
+
+ lv_16sc_t bb_signal_sample;
+ for(int i=0; i < num_points%8; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+#include <stdio.h>
+#include <tmmintrin.h>
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (generic version)
+
+ Each output is first set to zero and then accumulates (input * carrier) * code over the
+ num_points samples. Real or imaginary code components equal to -128 are replaced by -127
+ beforehand. The multiplications use lv_16sc_t operands and every product is cast to
+ lv_32fc_t before it is added to the output.
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    // The index is unsigned like num_points, so the loop condition never mixes signedness.
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        // Widen the 8-bit code samples to 16 bits per component before they are modified.
+        lv_16sc_t VE_code_value = VE_code[i];
+        lv_16sc_t E_code_value = E_code[i];
+        lv_16sc_t P_code_value = P_code[i];
+        lv_16sc_t L_code_value = L_code[i];
+        lv_16sc_t VL_code_value = VL_code[i];
+
+        // Same -128 -> -127 substitution that the SSE4.1 kernel of this file applies to its code vectors.
+        if(lv_creal(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+        }
+        if(lv_cimag(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+        }
+
+        if(lv_creal(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+        }
+        if(lv_cimag(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+        }
+
+        if(lv_creal(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+        }
+        if(lv_cimag(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+        }
+
+        if(lv_creal(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+        }
+        if(lv_cimag(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+        }
+
+        if(lv_creal(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+        }
+        if(lv_cimag(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+        }
+
+        //Perform the carrier wipe-off
+        lv_16sc_t bb_signal_sample = input[i] * carrier[i];
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+//#ifdef LV_HAVE_GENERIC
+//#include <stdio.h>
+//#include <stdlib.h>
+//#include <tmmintrin.h>
+//
+//#ifndef MAX
+//#define MAX(a,b) ((a) > (b) ? a : b)
+//#endif
+//
+//#ifndef MIN
+//#define MIN(a,b) ((a) < (b) ? a : b)
+//#endif
+//
+///*!
+// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+// \param input The input signal input
+// \param carrier The carrier signal input
+// \param VE_code Very Early PRN code replica input
+// \param E_code Early PRN code replica input
+// \param P_code Prompt PRN code replica input
+// \param L_code Late PRN code replica input
+// \param VL_code Very Late PRN code replica input
+// \param VE_out Very Early correlation output
+// \param E_out Early correlation output
+// \param P_out Prompt correlation output
+// \param L_out Late correlation output
+// \param VL_out Very Late correlation output
+// \param num_points The number of complex values in vectors
+// */
+//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+//{
+// *VE_out = 0;
+// *E_out = 0;
+// *P_out = 0;
+// *L_out = 0;
+// *VL_out = 0;
+//
+// lv_16sc_t VE_out16;
+// lv_16sc_t E_out16;
+// lv_16sc_t P_out16;
+// lv_16sc_t L_out16;
+// lv_16sc_t VL_out16;
+//
+// int32_t max = 32767;
+// int32_t min = -32768;
+//
+// int16_t real_real;
+// int16_t imag_imag;
+// int16_t real_imag;
+// int16_t imag_real;
+// int32_t out_real_32;
+// int32_t out_imag_32;
+// int16_t out_real_16;
+// int16_t out_imag_16;
+// int16_t aux1;
+// int16_t aux2;
+//
+//
+// lv_8sc_t bb_signal_sample = lv_cmake(0, 0);
+//
+// // perform very early, Early, Prompt, Late and very late correlation
+// for(int i=0; i < num_points; ++i)
+// {
+// //Perform the carrier wipe-off
+// bb_signal_sample = input[i] * carrier[i];
+//
+// aux1 = (int16_t)lv_creal(bb_signal_sample);
+// aux2 = (int16_t)lv_creal(VE_code[i]);
+// real_real = aux1*aux2;
+// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+// aux2 = (int16_t)lv_cimag(VE_code[i]);
+// imag_imag = aux1*aux2;
+// aux1 = (int16_t)lv_creal(bb_signal_sample);
+// aux2 = (int16_t)lv_cimag(VE_code[i]);
+// real_imag = aux1*aux2;
+// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+// aux2 = (int16_t)lv_creal(VE_code[i]);
+// imag_real = aux1*aux2;
+// out_real_32 = (int32_t)real_real - (int32_t)imag_imag;
+// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real;
+// out_real_16 = MIN(MAX(out_real_32, min), max);
+// out_imag_16 = MIN(MAX(out_imag_32, min), max);
+// VE_out16 = lv_cmake(out_real_16, out_imag_16);
+//
+//
+//
+// if(lv_creal(L_code[i]) == -128)
+// {
+// int8_t* L_pointer = (int8_t*)&L_code[i];
+// *L_pointer = -127;
+// }
+// if(lv_cimag(L_code[i]) == -128)
+// {
+// int8_t* L_pointer = (int8_t*)&L_code[i];
+// L_pointer++;
+// *L_pointer = -127;
+// }
+// aux1 = (int16_t)lv_creal(bb_signal_sample);
+// aux2 = (int16_t)lv_creal(L_code[i]);
+// real_real = aux1*aux2;
+// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+// aux2 = (int16_t)lv_cimag(L_code[i]);
+// imag_imag = aux1*aux2;
+// aux1 = (int16_t)lv_creal(bb_signal_sample);
+// aux2 = (int16_t)lv_cimag(L_code[i]);
+// real_imag = aux1*aux2;
+// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+// aux2 = (int16_t)lv_creal(L_code[i]);
+// imag_real = aux1*aux2;
+// out_real_32 = (int32_t)real_real - (int32_t)imag_imag;
+// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real;
+// out_real_16 = MIN(MAX(out_real_32, min), max);
+// out_imag_16 = MIN(MAX(out_imag_32, min), max);
+// L_out16 = lv_cmake(out_real_16, out_imag_16);
+//
+// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i];
+// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i];
+// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i];
+//
+//
+// *VE_out += (lv_32fc_t) VE_out16;
+// *E_out += (lv_32fc_t) E_out16;
+// *P_out += (lv_32fc_t) P_out16;
+// *L_out += (lv_32fc_t) L_out16;
+// *VL_out += (lv_32fc_t) VL_out16;
+//
+// //error in the real part of L with 32 samples
+// //*L_out = lv_cmake(12, 12);
+// }
+//}
+//
+//#endif /* LV_HAVE_GENERIC */
+
+//#ifdef LV_HAVE_GENERIC
+///*!
+// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+// \param input The input signal input
+// \param carrier The carrier signal input
+// \param VE_code Very Early PRN code replica input
+// \param E_code Early PRN code replica input
+// \param P_code Prompt PRN code replica input
+// \param L_code Late PRN code replica input
+// \param VL_code Very Late PRN code replica input
+// \param VE_out Very Early correlation output
+// \param E_out Early correlation output
+// \param P_out Prompt correlation output
+// \param L_out Late correlation output
+// \param VL_out Very Late correlation output
+// \param num_points The number of complex values in vectors
+// */
+//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+//{
+// lv_8sc_t bb_signal_sample;
+//
+// bb_signal_sample = lv_cmake(0, 0);
+//
+// *VE_out = 0;
+// *E_out = 0;
+// *P_out = 0;
+// *L_out = 0;
+// *VL_out = 0;
+// // perform very early, Early, Prompt, Late and very late correlation
+// for(int i=0; i < num_points; ++i)
+// {
+// //Perform the carrier wipe-off
+// bb_signal_sample = input[i] * carrier[i];
+//
+// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+// *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+// }
+//}
+//
+//#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,772 @@
+/*!
+ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part), and accumulates the result
+ * in 32-bit single-precision floating point values, returning float32 values:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Very Early values are calculated by multiplying the input signal in BB by the
+ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Very Late values are calculated by multiplying the input signal in BB by the
+ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ *
+ * -------------------------------------------------------------------------
+ * Bits analysis
+ *
+ * input = 8 bits
+ * carrier = 8 bits
+ * XX_code = 8 bits
+ * XX_out16 = 16 bits
+ * bb_signal_sample = 8 bits
+ *
+ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+ *
+ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation ("safe" kernel, unaligned SSE4.1 version)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (a single value, overwritten on every call)
+ \param E_out Early correlation output (a single value, overwritten on every call)
+ \param P_out Prompt correlation output (a single value, overwritten on every call)
+ \param L_out Late correlation output (a single value, overwritten on every call)
+ \param VL_out Very Late correlation output (a single value, overwritten on every call)
+ \param num_points The number of complex values in vectors; the last num_points % 8 values are processed by a scalar loop
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+    const unsigned int tail_points = num_points % 8;
+
+    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+    __m128i real_output, imag_output;
+    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 real_output_ps, imag_output_ps;
+    __m128i minus128control;
+
+    __m128i minus128 = _mm_set1_epi8 (-128);
+    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    float VE_out_real = 0;
+    float VE_out_imag = 0;
+    float E_out_real = 0;
+    float E_out_imag = 0;
+    float P_out_real = 0;
+    float P_out_imag = 0;
+    float L_out_real = 0;
+    float L_out_imag = 0;
+    float VL_out_real = 0;
+    float VL_out_imag = 0;
+
+    real_VE_code_acc = _mm_setzero_ps();
+    imag_VE_code_acc = _mm_setzero_ps();
+    real_E_code_acc = _mm_setzero_ps();
+    imag_E_code_acc = _mm_setzero_ps();
+    real_P_code_acc = _mm_setzero_ps();
+    imag_P_code_acc = _mm_setzero_ps();
+    real_L_code_acc = _mm_setzero_ps();
+    imag_L_code_acc = _mm_setzero_ps();
+    real_VL_code_acc = _mm_setzero_ps();
+    imag_VL_code_acc = _mm_setzero_ps();
+
+    // When num_points < 8 this loop does not run; the accumulators stay at zero and the outputs below are still written.
+    for(unsigned int number = 0; number < sse_iters; number++)
+    {
+        //Perform the carrier wipe-off
+        x = _mm_lddqu_si128((__m128i*)input_ptr);
+        y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+        x_abs = _mm_abs_epi8 (x);
+
+        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+
+        imag_output = _mm_slli_si128 (imag_output, 1);
+        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+        //Get very early values
+        y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+        //Get early values
+        y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+        //Get prompt values
+        y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+        //Get late values
+        y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+        //Get very late values
+        y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+        input_ptr += 8;
+        carrier_ptr += 8;
+        VE_code_ptr += 8;
+        E_code_ptr += 8;
+        P_code_ptr += 8;
+        L_code_ptr += 8;
+        VL_code_ptr += 8;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+    _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+    for (int i = 0; i<4; ++i)
+    {
+        VE_out_real += real_VE_dotProductVector[i];
+        VE_out_imag += imag_VE_dotProductVector[i];
+        E_out_real += real_E_dotProductVector[i];
+        E_out_imag += imag_E_dotProductVector[i];
+        P_out_real += real_P_dotProductVector[i];
+        P_out_imag += imag_P_dotProductVector[i];
+        L_out_real += real_L_dotProductVector[i];
+        L_out_imag += imag_L_dotProductVector[i];
+        VL_out_real += real_VL_dotProductVector[i];
+        VL_out_imag += imag_VL_dotProductVector[i];
+    }
+    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+
+    // Scalar tail for the last num_points % 8 points, accumulated on top of the values stored above.
+    // All five codes, the very late one included, get the -128 -> -127 substitution.
+    for(unsigned int i = 0; i < tail_points; ++i)
+    {
+        lv_16sc_t VE_code_value = *VE_code_ptr++;
+        lv_16sc_t E_code_value = *E_code_ptr++;
+        lv_16sc_t P_code_value = *P_code_ptr++;
+        lv_16sc_t L_code_value = *L_code_ptr++;
+        lv_16sc_t VL_code_value = *VL_code_ptr++;
+
+        if(lv_creal(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+        }
+        if(lv_cimag(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+        }
+
+        if(lv_creal(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+        }
+        if(lv_cimag(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+        }
+
+        if(lv_creal(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+        }
+        if(lv_cimag(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+        }
+
+        if(lv_creal(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+        }
+        if(lv_cimag(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+        }
+
+        if(lv_creal(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+        }
+        if(lv_cimag(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+        }
+
+        //Perform the carrier wipe-off
+        lv_16sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+#include <stdio.h>
+#include <tmmintrin.h>
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    // Each output is a single complex accumulator (not a vector); start all five from zero.
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    // Unsigned counter: num_points is unsigned, so a signed index would mix signedness in the loop test.
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        // Copy the code samples into 16-bit complex locals so they can be inspected and clamped.
+        lv_16sc_t VE_code_value = VE_code[i];
+        lv_16sc_t E_code_value = E_code[i];
+        lv_16sc_t P_code_value = P_code[i];
+        lv_16sc_t L_code_value = L_code[i];
+        lv_16sc_t VL_code_value = VL_code[i];
+
+        // "Safe" variant: a component equal to -128 is replaced by -127, so every code
+        // value stays within [-127, 127]. Real and imaginary parts are tested separately,
+        // and the imaginary test runs on the already-clamped value, so both can be fixed.
+        // NOTE(review): presumably this mirrors the -128 handling of the SIMD kernels - confirm.
+        if(lv_creal(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+        }
+        if(lv_cimag(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+        }
+
+        if(lv_creal(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+        }
+        if(lv_cimag(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+        }
+
+        if(lv_creal(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+        }
+        if(lv_cimag(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+        }
+
+        if(lv_creal(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+        }
+        if(lv_cimag(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+        }
+
+        if(lv_creal(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+        }
+        if(lv_cimag(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+        }
+
+        // Carrier wipe-off: mix the input sample with the carrier sample
+        const lv_16sc_t bb_signal_sample = input[i] * carrier[i];
+        // Correlate the baseband sample with each code replica and accumulate as float32 complex
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // each SIMD iteration consumes 8 complex samples per input
+
+    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+    __m128i real_output, imag_output;
+    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 real_output_ps, imag_output_ps;
+    __m128i minus128control;
+
+    __m128i minus128 = _mm_set1_epi8 (-128);
+    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    float VE_out_real = 0;
+    float VE_out_imag = 0;
+    float E_out_real = 0;
+    float E_out_imag = 0;
+    float P_out_real = 0;
+    float P_out_imag = 0;
+    float L_out_real = 0;
+    float L_out_imag = 0;
+    float VL_out_real = 0;
+    float VL_out_imag = 0;
+
+    real_VE_code_acc = _mm_setzero_ps();
+    imag_VE_code_acc = _mm_setzero_ps();
+    real_E_code_acc = _mm_setzero_ps();
+    imag_E_code_acc = _mm_setzero_ps();
+    real_P_code_acc = _mm_setzero_ps();
+    imag_P_code_acc = _mm_setzero_ps();
+    real_L_code_acc = _mm_setzero_ps();
+    imag_L_code_acc = _mm_setzero_ps();
+    real_VL_code_acc = _mm_setzero_ps();
+    imag_VL_code_acc = _mm_setzero_ps();
+
+    // No "sse_iters > 0" guard: the reduction below must always run so the five outputs are
+    // written (as zero) even when num_points < 8 and only the scalar tail adds to them.
+    for(unsigned int number = 0; number < sse_iters; number++){
+
+        //Perform the carrier wipe-off
+        x = _mm_load_si128((__m128i*)input_ptr);
+        y = _mm_load_si128((__m128i*)carrier_ptr);
+
+        x_abs = _mm_abs_epi8 (x);
+
+        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+
+        imag_output = _mm_slli_si128 (imag_output, 1);
+        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+        //Get very early values
+        y = _mm_load_si128((__m128i*)VE_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+        //Get early values
+        y = _mm_load_si128((__m128i*)E_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+        //Get prompt values
+        y = _mm_load_si128((__m128i*)P_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+        //Get late values
+        y = _mm_load_si128((__m128i*)L_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+        //Get very late values
+        y = _mm_load_si128((__m128i*)VL_code_ptr);
+
+        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+        input_ptr += 8;
+        carrier_ptr += 8;
+        VE_code_ptr += 8;
+        E_code_ptr += 8;
+        P_code_ptr += 8;
+        L_code_ptr += 8;
+        VL_code_ptr += 8;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+    _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+    _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+    for (int i = 0; i<4; ++i)
+    {
+        VE_out_real += real_VE_dotProductVector[i];
+        VE_out_imag += imag_VE_dotProductVector[i];
+        E_out_real += real_E_dotProductVector[i];
+        E_out_imag += imag_E_dotProductVector[i];
+        P_out_real += real_P_dotProductVector[i];
+        P_out_imag += imag_P_dotProductVector[i];
+        L_out_real += real_L_dotProductVector[i];
+        L_out_imag += imag_L_dotProductVector[i];
+        VL_out_real += real_VL_dotProductVector[i];
+        VL_out_imag += imag_VL_dotProductVector[i];
+    }
+    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+
+    // Scalar tail for the last num_points % 8 samples. Every code replica, including Very Late,
+    // gets the same -128 -> -127 clamp as in the generic kernel so that both paths agree.
+    for(unsigned int i = 0; i < num_points % 8; ++i)
+    {
+        lv_16sc_t VE_code_value = *VE_code_ptr++;
+        lv_16sc_t E_code_value = *E_code_ptr++;
+        lv_16sc_t P_code_value = *P_code_ptr++;
+        lv_16sc_t L_code_value = *L_code_ptr++;
+        lv_16sc_t VL_code_value = *VL_code_ptr++;
+
+        if(lv_creal(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+        }
+        if(lv_cimag(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+        }
+
+        if(lv_creal(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+        }
+        if(lv_cimag(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+        }
+
+        if(lv_creal(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+        }
+        if(lv_cimag(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+        }
+
+        if(lv_creal(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+        }
+        if(lv_cimag(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+        }
+
+        if(lv_creal(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+        }
+        if(lv_cimag(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+        }
+
+        //Perform the carrier wipe-off
+        const lv_16sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+#include <stdio.h>
+#include <tmmintrin.h>
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    // Each output is a single complex accumulator (not a vector); start all five from zero.
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    // Unsigned counter: num_points is unsigned, so a signed index would mix signedness in the loop test.
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        // Copy the code samples into 16-bit complex locals so they can be inspected and clamped.
+        lv_16sc_t VE_code_value = VE_code[i];
+        lv_16sc_t E_code_value = E_code[i];
+        lv_16sc_t P_code_value = P_code[i];
+        lv_16sc_t L_code_value = L_code[i];
+        lv_16sc_t VL_code_value = VL_code[i];
+
+        // "Safe" variant: a component equal to -128 is replaced by -127, so every code
+        // value stays within [-127, 127]. Real and imaginary parts are tested separately,
+        // and the imaginary test runs on the already-clamped value, so both can be fixed.
+        // NOTE(review): presumably this mirrors the -128 handling of the SIMD kernels - confirm.
+        if(lv_creal(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+        }
+        if(lv_cimag(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+        }
+
+        if(lv_creal(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+        }
+        if(lv_cimag(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+        }
+
+        if(lv_creal(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+        }
+        if(lv_cimag(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+        }
+
+        if(lv_creal(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+        }
+        if(lv_cimag(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+        }
+
+        if(lv_creal(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+        }
+        if(lv_cimag(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+        }
+
+        // Carrier wipe-off: mix the input sample with the carrier sample
+        const lv_16sc_t bb_signal_sample = input[i] * carrier[i];
+        // Correlate the baseband sample with each code replica and accumulate as float32 complex
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,554 @@
+/*!
+ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code" inputs must be values between -127 and 127.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part), and accumulates the result
+ * in 32-bit single-precision floating point values, returning float32 values:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors) It returns the input
+ * signal in base band (BB)
+ * - Very Early values are calculated by multiplying the input signal in BB by the
+ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Very Late values are calculated by multiplying the input signal in BB by the
+ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ *
+ * -------------------------------------------------------------------------
+ * Bits analysis
+ *
+ * input = 8 bits
+ * carrier = 8 bits
+ * XX_code = 8 bits
+ * XX_out16 = 16 bits
+ * bb_signal_sample = 8 bits
+ *
+ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+ *
+ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // each SIMD iteration consumes 8 complex samples per input
+
+    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+    __m128i real_output, imag_output;
+    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 real_output_ps, imag_output_ps;
+
+    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    float VE_out_real = 0;
+    float VE_out_imag = 0;
+    float E_out_real = 0;
+    float E_out_imag = 0;
+    float P_out_real = 0;
+    float P_out_imag = 0;
+    float L_out_real = 0;
+    float L_out_imag = 0;
+    float VL_out_real = 0;
+    float VL_out_imag = 0;
+
+    real_VE_code_acc = _mm_setzero_ps();
+    imag_VE_code_acc = _mm_setzero_ps();
+    real_E_code_acc = _mm_setzero_ps();
+    imag_E_code_acc = _mm_setzero_ps();
+    real_P_code_acc = _mm_setzero_ps();
+    imag_P_code_acc = _mm_setzero_ps();
+    real_L_code_acc = _mm_setzero_ps();
+    imag_L_code_acc = _mm_setzero_ps();
+    real_VL_code_acc = _mm_setzero_ps();
+    imag_VL_code_acc = _mm_setzero_ps();
+
+    // No "sse_iters > 0" guard: the reduction below must always run so the five outputs are
+    // written (as zero) even when num_points < 8 and only the scalar tail adds to them.
+    for(unsigned int number = 0; number < sse_iters; number++){
+
+        //Perform the carrier wipe-off
+        x = _mm_lddqu_si128((__m128i*)input_ptr);
+        y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+        x_abs = _mm_abs_epi8 (x);
+
+        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+
+        imag_output = _mm_slli_si128 (imag_output, 1);
+        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+        //Get very early values
+        y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+        //Get early values
+        y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+        //Get prompt values
+        y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+        //Get late values
+        y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+        //Get very late values
+        y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+        input_ptr += 8;
+        carrier_ptr += 8;
+        VE_code_ptr += 8;
+        E_code_ptr += 8;
+        P_code_ptr += 8;
+        L_code_ptr += 8;
+        VL_code_ptr += 8;
+    }
+
+    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+    _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+    _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+    for (int i = 0; i<4; ++i)
+    {
+        VE_out_real += real_VE_dotProductVector[i];
+        VE_out_imag += imag_VE_dotProductVector[i];
+        E_out_real += real_E_dotProductVector[i];
+        E_out_imag += imag_E_dotProductVector[i];
+        P_out_real += real_P_dotProductVector[i];
+        P_out_imag += imag_P_dotProductVector[i];
+        L_out_real += real_L_dotProductVector[i];
+        L_out_imag += imag_L_dotProductVector[i];
+        VL_out_real += real_VL_dotProductVector[i];
+        VL_out_imag += imag_VL_dotProductVector[i];
+    }
+    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+
+    // Scalar tail: the remaining num_points % 8 samples are accumulated onto the SIMD result.
+    lv_16sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points % 8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+#include <stdio.h>
+#include <tmmintrin.h>
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+ *VE_out = 0;
+ *E_out = 0;
+ *P_out = 0;
+ *L_out = 0;
+ *VL_out = 0;
+
+ lv_16sc_t bb_signal_sample;
+
+ for(int i=0; i < num_points; ++i)
+ {
+ //Perform the carrier wipe-off
+ bb_signal_sample = input[i] * carrier[i];
+ // Now get very early, early, prompt, late and very late values for each
+ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, aligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+    __m128i real_output, imag_output;
+    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 real_output_ps, imag_output_ps;
+
+    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    float VE_out_real = 0;
+    float VE_out_imag = 0;
+    float E_out_real = 0;
+    float E_out_imag = 0;
+    float P_out_real = 0;
+    float P_out_imag = 0;
+    float L_out_real = 0;
+    float L_out_imag = 0;
+    float VL_out_real = 0;
+    float VL_out_imag = 0;
+
+    real_VE_code_acc = _mm_setzero_ps();
+    imag_VE_code_acc = _mm_setzero_ps();
+    real_E_code_acc = _mm_setzero_ps();
+    imag_E_code_acc = _mm_setzero_ps();
+    real_P_code_acc = _mm_setzero_ps();
+    imag_P_code_acc = _mm_setzero_ps();
+    real_L_code_acc = _mm_setzero_ps();
+    imag_L_code_acc = _mm_setzero_ps();
+    real_VL_code_acc = _mm_setzero_ps();
+    imag_VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+        {
+            for(unsigned int number = 0;number < sse_iters; number++){
+
+                //Perform the carrier wipe-off
+                x = _mm_load_si128((__m128i*)input_ptr);
+                y = _mm_load_si128((__m128i*)carrier_ptr);
+
+                x_abs = _mm_abs_epi8 (x);
+
+                CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+
+                imag_output = _mm_slli_si128 (imag_output, 1);
+                bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+                bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+                //Get very early values
+                y = _mm_load_si128((__m128i*)VE_code_ptr);
+
+                CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+                real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+                imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+                //Get early values
+                y = _mm_load_si128((__m128i*)E_code_ptr);
+
+                CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+                real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+                imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+                //Get prompt values
+                y = _mm_load_si128((__m128i*)P_code_ptr);
+
+                CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+                real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+                imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+                //Get late values
+                y = _mm_load_si128((__m128i*)L_code_ptr);
+
+                CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+                real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+                imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+                //Get very late values
+                y = _mm_load_si128((__m128i*)VL_code_ptr);
+
+                CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+                real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+                imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+                input_ptr += 8;
+                carrier_ptr += 8;
+                VE_code_ptr += 8;
+                E_code_ptr += 8;
+                P_code_ptr += 8;
+                L_code_ptr += 8;
+                VL_code_ptr += 8;
+            }
+
+            __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+            __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+            _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+            _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+
+            for (int i = 0; i<4; ++i)
+                {
+                    VE_out_real += real_VE_dotProductVector[i];
+                    VE_out_imag += imag_VE_dotProductVector[i];
+                    E_out_real += real_E_dotProductVector[i];
+                    E_out_imag += imag_E_dotProductVector[i];
+                    P_out_real += real_P_dotProductVector[i];
+                    P_out_imag += imag_P_dotProductVector[i];
+                    L_out_real += real_L_dotProductVector[i];
+                    L_out_imag += imag_L_dotProductVector[i];
+                    VL_out_real += real_VL_dotProductVector[i];
+                    VL_out_imag += imag_VL_dotProductVector[i];
+                }
+        }
+    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+    // Outputs are written unconditionally above (zero when num_points < 8) so the scalar tail below never adds to uninitialised memory
+    lv_16sc_t bb_signal_sample;
+    for(unsigned int i=0; i < num_points%8; ++i)
+        {
+            //Perform the carrier wipe-off
+            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+            // Now get very early, early, prompt, late and very late values for each
+            *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+            *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+        }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+#include <stdio.h>
+#include <tmmintrin.h>
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (plain C version)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single accumulated value, overwritten)
+ \param E_out Early correlation output (single accumulated value, overwritten)
+ \param P_out Prompt correlation output (single accumulated value, overwritten)
+ \param L_out Late correlation output (single accumulated value, overwritten)
+ \param VL_out Very Late correlation output (single accumulated value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+    // The mixed sample is held as lv_16sc_t; every code product is cast to lv_32fc_t before it is accumulated
+    lv_16sc_t bb_signal_sample;
+
+    for(unsigned int i=0; i < num_points; ++i) // unsigned index: matches the type of num_points
+        {
+            //Perform the carrier wipe-off
+            bb_signal_sample = input[i] * carrier[i];
+            // Now get very early, early, prompt, late and very late values for each
+            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+        }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */
\ No newline at end of file
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,210 @@
+/*!
+ * \file volk_gnsssdr_8u_x2_multiply_8u.h
+ * \brief Volk protokernel: multiplies unsigned char values
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that multiplies unsigned char values (8 bits data)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
+#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <emmintrin.h>
+/*!
+ \brief Multiplies the two input unsigned char vectors element by element and stores the low 8 bits of each product in the third unsigned char vector (unaligned SSE3 version)
+ \param cChar The unsigned char where the results will be stored
+ \param aChar One of the unsigned char to be multiplied
+ \param bChar One of the unsigned char to be multiplied
+ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+ */
+static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 16;
+
+    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+    unsigned char* c = cChar;
+    const unsigned char* a = aChar;
+    const unsigned char* b = bChar;
+    const __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask: keeps the low byte of every 16-bit lane
+    for(unsigned int number = 0;number < sse_iters; number++){
+        x = _mm_lddqu_si128((__m128i*)a);
+        y = _mm_lddqu_si128((__m128i*)b);
+
+        // x1/y1: odd-indexed bytes, x2/y2: even-indexed bytes, each zero-extended into a 16-bit lane
+        x1 = _mm_srli_si128 (x, 1);
+        x1 = _mm_and_si128 (x1, mult1);
+        x2 = _mm_and_si128 (x, mult1);
+
+        y1 = _mm_srli_si128 (y, 1);
+        y1 = _mm_and_si128 (y1, mult1);
+        y2 = _mm_and_si128 (y, mult1);
+
+        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
+        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
+
+        tmp = _mm_and_si128 (x1_mult_y1, mult1);
+        tmp1 = _mm_slli_si128 (tmp, 1);
+        tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
+        totalc = _mm_or_si128 (tmp1, tmp2);
+
+        _mm_storeu_si128((__m128i*)c, totalc);
+
+        a += 16;
+        b += 16;
+        c += 16;
+    }
+
+    for (unsigned int i = 0; i<(num_points % 16); ++i)
+        {
+            *c++ = (*a++) * (*b++);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input unsigned char vectors element by element and stores each product, truncated to unsigned char, in the third vector
+ \param cChar The unsigned char where the results will be stored
+ \param aChar One of the unsigned char to be multiplied
+ \param bChar One of the unsigned char to be multiplied
+ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+ */
+static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+    unsigned char* cPtr = cChar;
+    const unsigned char* aPtr = aChar;
+    const unsigned char* bPtr = bChar;
+
+    for(unsigned int number = 0; number < num_points; number++){ // unsigned index: matches the type of num_points
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
+#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <emmintrin.h>
+/*!
+ \brief Multiplies the two input unsigned char vectors element by element and stores the low 8 bits of each product in the third unsigned char vector (aligned SSE3 version: uses aligned 16-byte loads and stores)
+ \param cChar The unsigned char where the results will be stored
+ \param aChar One of the unsigned char to be multiplied
+ \param bChar One of the unsigned char to be multiplied
+ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+ */
+static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 16;
+
+    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+    unsigned char* c = cChar;
+    const unsigned char* a = aChar;
+    const unsigned char* b = bChar;
+    const __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask: keeps the low byte of every 16-bit lane
+    for(unsigned int number = 0;number < sse_iters; number++){
+        x = _mm_load_si128((__m128i*)a);
+        y = _mm_load_si128((__m128i*)b);
+
+        // x1/y1: odd-indexed bytes, x2/y2: even-indexed bytes, each zero-extended into a 16-bit lane
+        x1 = _mm_srli_si128 (x, 1);
+        x1 = _mm_and_si128 (x1, mult1);
+        x2 = _mm_and_si128 (x, mult1);
+
+        y1 = _mm_srli_si128 (y, 1);
+        y1 = _mm_and_si128 (y1, mult1);
+        y2 = _mm_and_si128 (y, mult1);
+
+        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
+        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
+
+        tmp = _mm_and_si128 (x1_mult_y1, mult1);
+        tmp1 = _mm_slli_si128 (tmp, 1);
+        tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
+        totalc = _mm_or_si128 (tmp1, tmp2);
+
+        _mm_store_si128((__m128i*)c, totalc);
+
+        a += 16;
+        b += 16;
+        c += 16;
+    }
+
+    for (unsigned int i = 0; i<(num_points % 16); ++i)
+        {
+            *c++ = (*a++) * (*b++);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies the two input unsigned char vectors element by element and stores each product, truncated to unsigned char, in the third vector
+ \param cChar The unsigned char where the results will be stored
+ \param aChar One of the unsigned char to be multiplied
+ \param bChar One of the unsigned char to be multiplied
+ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+ */
+static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+    unsigned char* cPtr = cChar;
+    const unsigned char* aPtr = aChar;
+    const unsigned char* bPtr = bChar;
+
+    for(unsigned int number = 0; number < num_points; number++){ // unsigned index: matches the type of num_points
+        *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Multiplies the two input unsigned char vectors by forwarding all arguments unchanged to the external volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl routine
+ \param cVector The unsigned char vector where the results will be stored
+ \param aVector One of the unsigned char vectors to be multiplied
+ \param bVector One of the unsigned char vectors to be multiplied
+ \param num_points The number of unsigned char values in aVector and bVector to be multiplied together and stored into cVector
+ */
+extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
+static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
+ volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */
diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 2014-10-17 01:53:55.000000000 +0200
@@ -0,0 +1,866 @@
+/*!
+ * \file volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
+ * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
+ * \authors <ul>
+ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * </ul>
+ *
+ * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2007 Julien Pommier
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ *(this is the zlib license)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2012 Giovanni Garberoglio
+ * Interdisciplinary Laboratory for Computational Science (LISC)
+ * Fondazione Bruno Kessler and University of Trento
+ * via Sommarive, 18
+ * I-38123 Trento (Italy)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Generates the local carrier replica d_carr_sign[n] = cos(phase_n) - j*sin(phase_n), with phase_n = phase_rad_init + n*phase_step_rad (AVX version)
+ \param d_carr_sign Output buffer that receives num_points complex carrier samples
+ \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment between consecutive samples, in radians
+ \param num_points The number of carrier samples to generate
+ */
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+
+    // Eight phases are processed per iteration. Sine and cosine are obtained together from the
+    // polynomial approximation credited in the file header (cephes-style range reduction by
+    // multiples of Pi/4, then one sine and one cosine polynomial selected per lane).
+    // The last num_points % 8 samples are computed with the scalar sin()/cos() functions.
+
+    const unsigned int sse_iters = num_points / 8;
+
+    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
+    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
+    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
+    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
+    __m128i _pi32avx_1 = _mm_set1_epi32(1);
+    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
+    __m128i _pi32avx_2 = _mm_set1_epi32(2);
+    __m128i _pi32avx_4 = _mm_set1_epi32(4);
+    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
+    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
+    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
+    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
+    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
+    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
+    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
+    __m256 _ps256_1 = _mm256_set1_ps(1.f);
+    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
+
+    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad);
+
+    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
+    __m256i imm0, imm2, imm4;
+    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
+    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
+    __VOLK_ATTR_ALIGNED(32) float cos_value[8];
+
+    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+
+    for(unsigned int i = 0; i < sse_iters; i++)
+        {
+
+            x = phase_rad_array;
+
+            /* extract the sign bit (upper one) */
+            sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
+
+            /* take the absolute value */
+            x = _mm256_xor_ps(x, sign_bit_sin);
+
+            /* scale by 4/Pi */
+            y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
+
+            /* we use SSE2 routines to perform the integer ops (on the two 128-bit halves) */
+
+            // Truncate to integers in an integer register, then split into halves with the integer extract
+            imm2 = _mm256_cvttps_epi32(y);
+            imm2_1 = _mm256_extractf128_si256 (imm2, 0);
+            imm2_2 = _mm256_extractf128_si256 (imm2, 1);
+
+            imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
+            imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
+
+            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
+            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
+
+            //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+
+            y = _mm256_cvtepi32_ps(imm2);
+
+            imm4_1 = imm2_1;
+            imm4_2 = imm2_2;
+
+            imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
+            imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
+
+            imm0_1 = _mm_slli_epi32(imm0_1, 29);
+            imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+            //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
+            imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
+
+            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
+            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
+
+            imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+            imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+            //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+
+            swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+            poly_mask = _mm256_castsi256_ps(imm2);
+
+            /* The magic pass: "Extended precision modular arithmetic"
+             x = ((x - y * DP1) - y * DP2) - y * DP3; */
+            xmm1 = _ps256_minus_cephes_DP1;
+            xmm2 = _ps256_minus_cephes_DP2;
+            xmm3 = _ps256_minus_cephes_DP3;
+            xmm1 = _mm256_mul_ps(y, xmm1);
+            xmm2 = _mm256_mul_ps(y, xmm2);
+            xmm3 = _mm256_mul_ps(y, xmm3);
+            x = _mm256_add_ps(x, xmm1);
+            x = _mm256_add_ps(x, xmm2);
+            x = _mm256_add_ps(x, xmm3);
+
+            imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
+            imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
+
+            imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
+            imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
+
+            imm4_1 = _mm_slli_epi32(imm4_1, 29);
+            imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+            //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
+            imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
+
+            sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+            sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+            /* Evaluate the first polynom (cosine coefficients, 0 <= x <= Pi/4) */
+            z = _mm256_mul_ps(x,x);
+            y = _ps256_coscof_p0;
+
+            y = _mm256_mul_ps(y, z);
+            y = _mm256_add_ps(y, _ps256_coscof_p1);
+            y = _mm256_mul_ps(y, z);
+            y = _mm256_add_ps(y, _ps256_coscof_p2);
+            y = _mm256_mul_ps(y, z);
+            y = _mm256_mul_ps(y, z);
+            tmp = _mm256_mul_ps(z, _ps256_0p5);
+            y = _mm256_sub_ps(y, tmp);
+            y = _mm256_add_ps(y, _ps256_1);
+
+            /* Evaluate the second polynom (sine coefficients) */
+
+            y2 = _ps256_sincof_p0;
+            y2 = _mm256_mul_ps(y2, z);
+            y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
+            y2 = _mm256_mul_ps(y2, z);
+            y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
+            y2 = _mm256_mul_ps(y2, z);
+            y2 = _mm256_mul_ps(y2, x);
+            y2 = _mm256_add_ps(y2, x);
+
+            /* select the correct result from the two polynoms */
+            xmm3 = poly_mask;
+            ysin2 = _mm256_and_ps(xmm3, y2);
+            ysin1 = _mm256_andnot_ps(xmm3, y);
+            y2 = _mm256_sub_ps(y2,ysin2);
+            y = _mm256_sub_ps(y, ysin1);
+
+            xmm1 = _mm256_add_ps(ysin1,ysin2);
+            xmm2 = _mm256_add_ps(y,y2);
+
+            /* update the sign */
+            s = _mm256_xor_ps(xmm1, sign_bit_sin);
+            c = _mm256_xor_ps(xmm2, sign_bit_cos);
+
+            //GNSS-SDR needs to return -sin
+            s = _mm256_xor_ps(s, _ps256_sign_mask);
+
+            _mm256_storeu_ps ((float*)sin_value, s);
+            _mm256_storeu_ps ((float*)cos_value, c);
+
+            for(int j = 0; j < 8; j++) // separate index name: no longer shadows the outer loop counter
+                {
+                    d_carr_sign[j] = lv_cmake(cos_value[j], sin_value[j]);
+                }
+            d_carr_sign += 8;
+
+            phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
+        }
+
+    if (num_points%8!=0)
+        {
+            __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
+            _mm256_storeu_ps (phase_rad_store, phase_rad_array); // float store for a float vector; element 0 is the phase of the next sample
+
+            float phase_rad = phase_rad_store[0];
+
+            for(unsigned int i = 0; i < num_points%8; i++)
+                {
+                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+                    d_carr_sign++;
+                    phase_rad += phase_step_rad;
+                }
+        }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
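+/*!
+ \brief Fills d_carr_sign with local carrier samples whose phase starts at phase_rad_init and advances by phase_step_rad per sample (SSE2 version, four samples per iteration)
+ \param d_carr_sign Output buffer for the complex carrier samples
+ \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment between consecutive samples (radians, per the parameter names)
+ \param num_points The number of carrier samples requested
+*/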
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+
+// float* pointer1 = (float*)&phase_rad_init;
+// *pointer1 = 0;
+// float* pointer2 = (float*)&phase_step_rad;
+// *pointer2 = 0.5;
+
+ const unsigned int sse_iters = num_points / 4;
+
+ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
+ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
+ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
+ __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
+ __m128i _pi32_1 = _mm_set1_epi32(1);
+ __m128i _pi32_inv1 = _mm_set1_epi32(~1);
+ __m128i _pi32_2 = _mm_set1_epi32(2);
+ __m128i _pi32_4 = _mm_set1_epi32(4);
+ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
+ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
+ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
+ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
+ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
+ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
+ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
+ __m128 _ps_1 = _mm_set1_ps(1.f);
+ __m128 _ps_0p5 = _mm_set1_ps(0.5f);
+
+ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
+
+ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+ __m128 xmm1, xmm2, xmm3, sign_bit_sin;
+ __m128i emm0, emm2, emm4;
+ __VOLK_ATTR_ALIGNED(16) float sin_value[4];
+ __VOLK_ATTR_ALIGNED(16) float cos_value[4];
+
+ phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+
+ for(int i = 0; i < sse_iters; i++)
+ {
+ x = phase_rad_array;
+
+ /* extract the sign bit (upper one) */
+ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
+
+ /* take the absolute value */
+ x = _mm_xor_ps(x, sign_bit_sin);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in emm2 */
+ emm2 = _mm_cvttps_epi32(y);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = _mm_add_epi32(emm2, _pi32_1);
+ emm2 = _mm_and_si128(emm2, _pi32_inv1);
+ y = _mm_cvtepi32_ps(emm2);
+
+ emm4 = emm2;
+
+ /* get the swap sign flag for the sine */
+ emm0 = _mm_and_si128(emm2, _pi32_4);
+ emm0 = _mm_slli_epi32(emm0, 29);
+ swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+ /* get the polynom selection mask for the sine*/
+ emm2 = _mm_and_si128(emm2, _pi32_2);
+ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+ poly_mask = _mm_castsi128_ps(emm2);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
+ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
+ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
+ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
+
+ emm4 = _mm_sub_epi32(emm4, _pi32_2);
+ emm4 = _mm_andnot_si128(emm4, _pi32_4);
+ emm4 = _mm_slli_epi32(emm4, 29);
+ sign_bit_cos = _mm_castsi128_ps(emm4);
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ z = _mm_mul_ps(x,x);
+ y = _ps_coscof_p0;
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, _mm_mul_ps(z, z));
+ tmp = _mm_mul_ps(z, _ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _ps_1);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+ y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ ysin2 = _mm_and_ps(xmm3, y2);
+ ysin1 = _mm_andnot_ps(xmm3, y);
+ y2 = _mm_sub_ps(y2,ysin2);
+ y = _mm_sub_ps(y, ysin1);
+
+ xmm1 = _mm_add_ps(ysin1,ysin2);
+ xmm2 = _mm_add_ps(y,y2);
+
+ /* update the sign */
+ s = _mm_xor_ps(xmm1, sign_bit_sin);
+ c = _mm_xor_ps(xmm2, sign_bit_cos);
+
+ //GNSS-SDR needs to return -sin
+ s = _mm_xor_ps(s, _ps_sign_mask);
+
+ _mm_storeu_ps ((float*)sin_value, s);
+ _mm_storeu_ps ((float*)cos_value, c);
+
+ for(int i = 0; i < 4; i++)
+ {
+ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]);
+ }
+ d_carr_sign += 4;
+
+ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
+ }
+
+ if (num_points%4!=0)
+ {
+ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
+ _mm_storeu_si128 ((__m128i*)phase_rad_store, phase_rad_array);
+
+ float phase_rad = phase_rad_store[0];
+
+ for(int i = 0; i < num_points%4; i++)
+ {
+ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+ d_carr_sign++;
+ phase_rad += phase_step_rad;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*! \brief Generates a local carrier replica: each output sample is lv_cmake(cos(phase), -sin(phase)), with the phase starting at phase_rad_init and advancing by phase_step_rad per sample
+ \param d_carr_sign Output buffer that receives num_points complex carrier samples
+ \param phase_rad_init Phase of the first generated sample, in radians
+ \param phase_step_rad Phase increment between consecutive samples, in radians
+ \param num_points Number of carrier samples to generate
+*/
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+
+    // Plain scalar implementation: every sample is computed with cos()/sin() and the
+    // sine is negated before it is packed with lv_cmake.
+    // The phase is accumulated in single precision, one phase_step_rad per sample.
+    // The loop counter is unsigned so that it matches the type of num_points.
+
+    float phase_rad = phase_rad_init;
+    for(unsigned int n = 0; n < num_points; n++)
+    {
+        *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+        d_carr_sign++;
+        phase_rad += phase_step_rad;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*! \brief Generates a local carrier replica: each output sample is lv_cmake(cos(phase), -sin(phase)), with the phase starting at phase_rad_init and advancing by phase_step_rad per sample
+ \param d_carr_sign Output buffer that receives num_points complex carrier samples (written element by element)
+ \param phase_rad_init Phase of the first generated sample, in radians
+ \param phase_step_rad Phase increment between consecutive samples, in radians
+ \param num_points Number of carrier samples to generate
+ */
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+
+    // Eight phases are processed per iteration: each one is reduced to |x| <= Pi/4
+    // (cephes-style), the cosine and sine polynomials are both evaluated, and the
+    // octant index decides which polynomial feeds each output and with which sign.
+    // The remaining num_points % 8 samples are produced with scalar cos()/sin().
+
+    const unsigned int sse_iters = num_points / 8;
+
+    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
+    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
+    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
+    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
+    __m128i _pi32avx_1 = _mm_set1_epi32(1);
+    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
+    __m128i _pi32avx_2 = _mm_set1_epi32(2);
+    __m128i _pi32avx_4 = _mm_set1_epi32(4);
+    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
+    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
+    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
+    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
+    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
+    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
+    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
+    __m256 _ps256_1 = _mm256_set1_ps(1.f);
+    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
+
+    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad);
+
+    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
+    __m256i imm0, imm2, imm4;
+    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
+    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
+    __VOLK_ATTR_ALIGNED(32) float cos_value[8];
+
+    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+
+    for(unsigned int n = 0; n < sse_iters; n++)
+    {
+
+        x = phase_rad_array;
+
+        /* extract the sign bit (upper one) */
+        sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
+
+        /* take the absolute value */
+        x = _mm256_xor_ps(x, sign_bit_sin);
+
+        /* scale by 4/Pi */
+        y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
+
+        /* we use SSE2 routines to perform the integer ops */
+
+        /* truncate to integers, then split the result into its two 128-bit halves */
+        imm2 = _mm256_cvttps_epi32(y);
+        imm2_1 = _mm256_extractf128_si256 (imm2, 0);
+        imm2_2 = _mm256_extractf128_si256 (imm2, 1);
+
+        imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
+        imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
+
+        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
+        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
+
+        /* merge the two 128-bit halves back into one 256-bit vector; */
+        /* _mm256_set_m128i is not defined in some versions of immintrin.h, */
+        /* so the equivalent cast + _mm256_insertf128_si256 is used instead */
+        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+
+        y = _mm256_cvtepi32_ps(imm2);
+
+        imm4_1 = imm2_1;
+        imm4_2 = imm2_2;
+
+        imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
+        imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
+
+        imm0_1 = _mm_slli_epi32(imm0_1, 29);
+        imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+        /* merge the halves of the sine swap-sign flag; */
+        /* _mm256_set_m128i is not defined in some versions of immintrin.h, */
+        /* so the equivalent cast + _mm256_insertf128_si256 is used instead */
+        imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
+
+        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
+        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
+
+        imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+        imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+        /* merge the halves of the polynomial selection mask; */
+        /* _mm256_set_m128i is not defined in some versions of immintrin.h, */
+        /* so the equivalent cast + _mm256_insertf128_si256 is used instead */
+        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+
+        swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+        poly_mask = _mm256_castsi256_ps(imm2);
+
+        /* The magic pass: "Extended precision modular arithmetic"
+           x = ((x - y * DP1) - y * DP2) - y * DP3; */
+        xmm1 = _ps256_minus_cephes_DP1;
+        xmm2 = _ps256_minus_cephes_DP2;
+        xmm3 = _ps256_minus_cephes_DP3;
+        xmm1 = _mm256_mul_ps(y, xmm1);
+        xmm2 = _mm256_mul_ps(y, xmm2);
+        xmm3 = _mm256_mul_ps(y, xmm3);
+        x = _mm256_add_ps(x, xmm1);
+        x = _mm256_add_ps(x, xmm2);
+        x = _mm256_add_ps(x, xmm3);
+
+        imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
+        imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
+
+        imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
+        imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
+
+        imm4_1 = _mm_slli_epi32(imm4_1, 29);
+        imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+        /* merge the halves of the cosine sign flag; */
+        /* _mm256_set_m128i is not defined in some versions of immintrin.h, */
+        /* so the equivalent cast + _mm256_insertf128_si256 is used instead */
+        imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
+
+        sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+        sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+        /* Evaluate the cosine polynomial on the reduced argument (|x| <= Pi/4) */
+        z = _mm256_mul_ps(x,x);
+        y = _ps256_coscof_p0;
+
+        y = _mm256_mul_ps(y, z);
+        y = _mm256_add_ps(y, _ps256_coscof_p1);
+        y = _mm256_mul_ps(y, z);
+        y = _mm256_add_ps(y, _ps256_coscof_p2);
+        y = _mm256_mul_ps(y, z);
+        y = _mm256_mul_ps(y, z);
+        tmp = _mm256_mul_ps(z, _ps256_0p5);
+        y = _mm256_sub_ps(y, tmp);
+        y = _mm256_add_ps(y, _ps256_1);
+
+        /* Evaluate the sine polynomial on the same reduced argument */
+
+        y2 = _ps256_sincof_p0;
+        y2 = _mm256_mul_ps(y2, z);
+        y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
+        y2 = _mm256_mul_ps(y2, z);
+        y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
+        y2 = _mm256_mul_ps(y2, z);
+        y2 = _mm256_mul_ps(y2, x);
+        y2 = _mm256_add_ps(y2, x);
+
+        /* select the correct result from the two polynomials */
+        xmm3 = poly_mask;
+        ysin2 = _mm256_and_ps(xmm3, y2);
+        ysin1 = _mm256_andnot_ps(xmm3, y);
+        y2 = _mm256_sub_ps(y2,ysin2);
+        y = _mm256_sub_ps(y, ysin1);
+
+        xmm1 = _mm256_add_ps(ysin1,ysin2);
+        xmm2 = _mm256_add_ps(y,y2);
+
+        /* update the sign */
+        s = _mm256_xor_ps(xmm1, sign_bit_sin);
+        c = _mm256_xor_ps(xmm2, sign_bit_cos);
+
+        //GNSS-SDR needs to return -sin
+        s = _mm256_xor_ps(s, _ps256_sign_mask);
+
+        _mm256_store_ps (sin_value, s);
+        _mm256_store_ps (cos_value, c);
+
+        for(unsigned int k = 0; k < 8; k++)
+        {
+            d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
+        }
+        d_carr_sign += 8;
+
+        phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
+    }
+
+    if (num_points%8!=0)
+    {
+        __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
+        _mm256_store_ps (phase_rad_store, phase_rad_array);
+
+        float phase_rad = phase_rad_store[0];
+
+        for(unsigned int k = 0; k < num_points%8; k++)
+        {
+            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+            d_carr_sign++;
+            phase_rad += phase_step_rad;
+        }
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*! \brief Generates a local carrier replica: each output sample is lv_cmake(cos(phase), -sin(phase)), with the phase starting at phase_rad_init and advancing by phase_step_rad per sample
+ \param d_carr_sign Output buffer that receives num_points complex carrier samples (written element by element)
+ \param phase_rad_init Phase of the first generated sample, in radians
+ \param phase_step_rad Phase increment between consecutive samples, in radians
+ \param num_points Number of carrier samples to generate
+ */
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+
+    // Four phases are processed per iteration: each one is reduced to |x| <= Pi/4
+    // (cephes-style), the cosine and sine polynomials are both evaluated, and the
+    // octant index decides which polynomial feeds each output and with which sign.
+    // The remaining num_points % 4 samples are produced with scalar cos()/sin().
+
+    const unsigned int sse_iters = num_points / 4;
+
+    __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
+    __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
+    __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
+    __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
+    __m128i _pi32_1 = _mm_set1_epi32(1);
+    __m128i _pi32_inv1 = _mm_set1_epi32(~1);
+    __m128i _pi32_2 = _mm_set1_epi32(2);
+    __m128i _pi32_4 = _mm_set1_epi32(4);
+    __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
+    __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
+    __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
+    __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
+    __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
+    __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
+    __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
+    __m128 _ps_1 = _mm_set1_ps(1.f);
+    __m128 _ps_0p5 = _mm_set1_ps(0.5f);
+
+    __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
+
+    __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+    __m128 xmm1, xmm2, xmm3, sign_bit_sin;
+    __m128i emm0, emm2, emm4;
+    __VOLK_ATTR_ALIGNED(16) float sin_value[4];
+    __VOLK_ATTR_ALIGNED(16) float cos_value[4];
+
+    phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+
+    for(unsigned int n = 0; n < sse_iters; n++)
+    {
+        x = phase_rad_array;
+
+        /* extract the sign bit (upper one) */
+        sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
+
+        /* take the absolute value */
+        x = _mm_xor_ps(x, sign_bit_sin);
+
+        /* scale by 4/Pi */
+        y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+        /* store the integer part of y in emm2 */
+        emm2 = _mm_cvttps_epi32(y);
+
+        /* j=(j+1) & (~1) (see the cephes sources) */
+        emm2 = _mm_add_epi32(emm2, _pi32_1);
+        emm2 = _mm_and_si128(emm2, _pi32_inv1);
+        y = _mm_cvtepi32_ps(emm2);
+
+        emm4 = emm2;
+
+        /* get the swap sign flag for the sine (bit 2 of j moved to the float sign bit) */
+        emm0 = _mm_and_si128(emm2, _pi32_4);
+        emm0 = _mm_slli_epi32(emm0, 29);
+        swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+        /* get the polynomial selection mask for the sine (all ones where j & 2 == 0) */
+        emm2 = _mm_and_si128(emm2, _pi32_2);
+        emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+        poly_mask = _mm_castsi128_ps(emm2);
+
+        /* The magic pass: "Extended precision modular arithmetic"
+           x = ((x - y * DP1) - y * DP2) - y * DP3; */
+        xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
+        xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
+        xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
+        x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
+
+        emm4 = _mm_sub_epi32(emm4, _pi32_2);
+        emm4 = _mm_andnot_si128(emm4, _pi32_4);
+        emm4 = _mm_slli_epi32(emm4, 29);
+        sign_bit_cos = _mm_castsi128_ps(emm4);
+
+        sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+        /* Evaluate the cosine polynomial on the reduced argument (|x| <= Pi/4) */
+        z = _mm_mul_ps(x,x);
+        y = _ps_coscof_p0;
+        y = _mm_mul_ps(y, z);
+        y = _mm_add_ps(y, _ps_coscof_p1);
+        y = _mm_mul_ps(y, z);
+        y = _mm_add_ps(y, _ps_coscof_p2);
+        y = _mm_mul_ps(y, _mm_mul_ps(z, z));
+        tmp = _mm_mul_ps(z, _ps_0p5);
+        y = _mm_sub_ps(y, tmp);
+        y = _mm_add_ps(y, _ps_1);
+
+        /* Evaluate the sine polynomial on the same reduced argument */
+        y2 = _ps_sincof_p0;
+        y2 = _mm_mul_ps(y2, z);
+        y2 = _mm_add_ps(y2, _ps_sincof_p1);
+        y2 = _mm_mul_ps(y2, z);
+        y2 = _mm_add_ps(y2, _ps_sincof_p2);
+        y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
+        y2 = _mm_add_ps(y2, x);
+
+        /* select the correct result from the two polynomials */
+        xmm3 = poly_mask;
+        ysin2 = _mm_and_ps(xmm3, y2);
+        ysin1 = _mm_andnot_ps(xmm3, y);
+        y2 = _mm_sub_ps(y2,ysin2);
+        y = _mm_sub_ps(y, ysin1);
+
+        xmm1 = _mm_add_ps(ysin1,ysin2);
+        xmm2 = _mm_add_ps(y,y2);
+
+        /* update the sign */
+        s = _mm_xor_ps(xmm1, sign_bit_sin);
+        c = _mm_xor_ps(xmm2, sign_bit_cos);
+
+        //GNSS-SDR needs to return -sin
+        s = _mm_xor_ps(s, _ps_sign_mask);
+
+        _mm_store_ps (sin_value, s);
+        _mm_store_ps (cos_value, c);
+
+        for(unsigned int k = 0; k < 4; k++)
+        {
+            d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
+        }
+        d_carr_sign += 4;
+
+        phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
+    }
+
+    if (num_points%4!=0)
+    {
+        __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
+        _mm_store_ps (phase_rad_store, phase_rad_array); /* float store: the vector is a __m128, not a __m128i */
+
+        float phase_rad = phase_rad_store[0];
+
+        for(unsigned int k = 0; k < num_points%4; k++)
+        {
+            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+            d_carr_sign++;
+            phase_rad += phase_step_rad;
+        }
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*! \brief Generates a local carrier replica: each output sample is lv_cmake(cos(phase), -sin(phase)), with the phase starting at phase_rad_init and advancing by phase_step_rad per sample
+ \param d_carr_sign Output buffer that receives num_points complex carrier samples
+ \param phase_rad_init Phase of the first generated sample, in radians
+ \param phase_step_rad Phase increment between consecutive samples, in radians
+ \param num_points Number of carrier samples to generate
+ */
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+
+    // Plain scalar implementation: every sample is computed with cos()/sin() and the
+    // sine is negated before it is packed with lv_cmake.
+    // The phase is accumulated in single precision, one phase_step_rad per sample.
+    // The loop counter is unsigned so that it matches the type of num_points.
+
+    float phase_rad = phase_rad_init;
+    for(unsigned int n = 0; n < num_points; n++)
+    {
+        *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+        d_carr_sign++;
+        phase_rad += phase_step_rad;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */
+
diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt
--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 05:07:22.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:50:28.000000000 +0200
@@ -517,7 +517,19 @@ if(MSVC)
endif()
#create the volk_gnsssdr runtime library
-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
+
+#MODIFICATIONS BY GNSS-SDR
+file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
+file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)
+
+#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
+add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})
+
+source_group("Kernels" FILES ${h_files})
+source_group("Common Macros" FILES ${CommonMacros})
+source_group("ORC Files" FILES ${orc})
+#END OF MODIFICATIONS
+
target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")
diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc
--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200
@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn
while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
}
+//ADDED BY GNSS-SDR. START
+inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 1 buffer + char scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 2 buffers + char scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 3 buffers + char scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 1 buffer + lv_8sc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 2 buffers + lv_8sc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 3 buffers + lv_8sc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 8 buffers, no scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str());
+}
+
+inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 8 buffers + char scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 8 buffers + lv_8sc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 8 buffers + float scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 8 buffers + lv_32fc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 12 buffers, no scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str());
+}
+
+inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 12 buffers + char scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 12 buffers + lv_8sc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 12 buffers + float scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { // calls func iter times: 12 buffers + lv_32fc_t scalar
+    for(unsigned int pass = 0; pass < iter; ++pass) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+}
+//ADDED BY GNSS-SDR. END
+
// This function is a nop that helps resolve GNU Radio bugs 582 and 583.
// Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol)
// won't happen on armhf (reported on cortex A9 and A15).
@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
} else {
run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
- } else throw "unsupported 1 arg function >1 scalars";
+ }
+ //ADDED BY GNSS-SDR. START
+ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ //ADDED BY GNSS-SDR. END
+ else throw "unsupported 1 arg function >1 scalars";
break;
case 2:
if(inputsc.size() == 0) {
@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
} else {
run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
- } else throw "unsupported 2 arg function >1 scalars";
+ }
+ //ADDED BY GNSS-SDR. START
+ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ //ADDED BY GNSS-SDR. END
+ else throw "unsupported 2 arg function >1 scalars";
break;
case 3:
if(inputsc.size() == 0) {
@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
} else {
run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
- } else throw "unsupported 3 arg function >1 scalars";
+ }
+ //ADDED BY GNSS-SDR. START
+ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ //ADDED BY GNSS-SDR. END
+ else throw "unsupported 3 arg function >1 scalars";
break;
case 4:
run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
break;
+ //ADDED BY GNSS-SDR. START
+ case 8:
+ if(inputsc.size() == 0) {
+ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ else throw "unsupported 8 arg function >1 scalars";
+ break;
+ case 12:
+ if(inputsc.size() == 0) {
+ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+ if(inputsc[0].is_complex) {
+ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+ } else {
+ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+ }
+ else throw "unsupported 12 arg function >1 scalars";
+ break;
+ //ADDED BY GNSS-SDR. END
default:
throw "no function handler for this signature";
break;
diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h
--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 05:07:24.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200
@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+//ADDED BY GNSS-SDR. START
+typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one buffer pointer and one char scalar; the run_cast_test* helpers pass vlen and the arch name as the last two arguments
+typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); //two buffer pointers, one char scalar
+typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); //three buffer pointers, one char scalar
+typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one buffer pointer, one lv_8sc_t scalar
+typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); //two buffer pointers, one lv_8sc_t scalar
+typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); //three buffer pointers, one lv_8sc_t scalar
+
+typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); //eight buffer pointers, no scalar
+typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); //eight buffer pointers, one float scalar
+typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); //eight buffer pointers, one lv_32fc_t scalar
+typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); //eight buffer pointers, one char scalar
+typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); //eight buffer pointers, one lv_8sc_t scalar
+
+typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); //twelve buffer pointers, no scalar
+typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); //twelve buffer pointers, one float scalar
+typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); //twelve buffer pointers, one lv_32fc_t scalar
+typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); //twelve buffer pointers, one char scalar
+typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); //twelve buffer pointers, one lv_8sc_t scalar
+//ADDED BY GNSS-SDR. END
+
+
#endif //VOLK_QA_UTILS_H
diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc
--- /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc 2014-10-15 01:55:08.000000000 +0200
@@ -24,6 +24,58 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <boost/test/unit_test.hpp>
+//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
+
+//GNSS-SDR PROTO-KERNELS
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1);
+
+VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
+
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
+
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
+
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1);
+
+VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1);
+
+
+
+
+
+
+
//VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000);
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,5 @@
+.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,18 @@
+.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl
+.source 8 src1
+.floatparam 8 scalar
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, scalar
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, scalar
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,18 @@
+.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, src2
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, src2
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,40 @@
+#/*!
+# * \file volk_gnsssdr_8i_accumulator_s8i.orc
+# * \brief ORC implementation: 8 bits (char) scalar accumulator
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that implements an accumulator of char values
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl
+.source 1 src1
+.accumulator 2 acc
+.temp 2 sum
+mergebw sum, 0, src1
+accw acc, sum
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,39 @@
+#/*!
+# * \file volk_gnsssdr_8i_x2_add_8i.orc
+# * \brief ORC implementation: adds pairs of 8 bits (char) scalars
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that adds pairs of 8 bits (char) scalars
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl
+.dest 1 dst
+.source 1 src1
+.source 1 src2
+addb dst, src1, src2
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,42 @@
+#/*!
+# * \file volk_gnsssdr_8ic_conjugate_8ic.orc
+# * \brief ORC implementation: calculates the conjugate of a 16 bits vector
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that calculates the conjugate of a
+# * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+# * result = real - j*imag (the imaginary part is negated)
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl
+.source 2 src1
+.dest 2 dst
+.temp 2 merged
+mergebw merged, 1, -1
+x2 mullb dst, merged, src1
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,45 @@
+#/*!
+# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc
+# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that calculates the magnitude squared of a
+# * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+# * result = (real*real) + (imag*imag)
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl
+.source 2 src1
+.dest 1 dst
+.temp 2 iqprod
+.temp 1 ac
+.temp 1 bd
+x2 mullb iqprod, src1, src1
+splitwb bd, ac, iqprod
+addb dst, ac, bd
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,58 @@
+#/*!
+# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc
+# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that multiplies a group of 16 bits vectors
+# * (8 bits the real part and 8 bits the imaginary part) by one constant vector
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl
+.source 2 src1
+.param 2 src2real
+.param 2 src2imag
+.dest 2 dst
+.temp 2 iqprod
+.temp 1 real
+.temp 1 imag
+.temp 1 rr
+.temp 1 ii
+.temp 1 ri
+.temp 1 ir
+x2 mullb iqprod, src1, src2real
+splitwb ir, rr, iqprod
+x2 mullb iqprod, src1, src2imag
+splitwb ii, ri, iqprod
+subb real, rr, ii
+addb imag, ri, ir
+mergebw dst, real, imag
+
+
+
+
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,59 @@
+#/*!
+# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc
+# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that multiplies two 16 bits vectors (8 bits the real part
+# * and 8 bits the imaginary part) and accumulates them
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl
+.source 2 src1
+.source 2 src2
+.accumulator 2 accreal
+.accumulator 2 accimag
+.temp 2 iqprod
+.temp 1 real
+.temp 1 imag
+.temp 2 real2
+.temp 2 imag2
+.temp 1 ac
+.temp 1 bd
+.temp 2 swapped
+x2 mullb iqprod, src1, src2
+splitwb bd, ac, iqprod
+subb real, ac, bd
+swapw swapped, src1
+x2 mullb iqprod, swapped, src2
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw real2, 0, real
+accw accreal, real2
+mergebw imag2, 0, imag
+accw accimag, imag2
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,57 @@
+#/*!
+# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc
+# * \brief ORC implementation: multiplies two 16 bits vectors
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that multiplies two 16 bits vectors (8 bits the real part
+# * and 8 bits the imaginary part)
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl
+.source 2 src1
+.source 2 src2
+.dest 2 dst
+.temp 2 iqprod
+.temp 1 real
+.temp 1 imag
+.temp 1 ac
+.temp 1 bd
+.temp 2 swapped
+x2 mullb iqprod, src1, src2
+splitwb bd, ac, iqprod
+subb real, ac, bd
+swapw swapped, src1
+x2 mullb iqprod, swapped, src2
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw dst, real, imag
+
+
+
+
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,139 @@
+#/*!
+# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
+# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that performs the carrier wipe-off mixing and the
+# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
+# * real part and 8 bits the imaginary part):
+# * - The carrier wipe-off is done by multiplying the input signal by the
+# * carrier (multiplication of 16 bits vectors). It returns the input
+# * signal in base band (BB)
+# * - Early values are calculated by multiplying the input signal in BB by the
+# * early code (multiplication of 16 bits vectors), accumulating the results
+# * - Prompt values are calculated by multiplying the input signal in BB by the
+# * prompt code (multiplication of 16 bits vectors), accumulating the results
+# * - Late values are calculated by multiplying the input signal in BB by the
+# * late code (multiplication of 16 bits vectors), accumulating the results
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl
+.source 2 input
+.source 2 carrier
+.source 2 E_code
+.source 2 P_code
+.accumulator 2 E_out_real
+.accumulator 2 E_out_imag
+.accumulator 2 P_out_real
+.accumulator 2 P_out_imag
+.temp 2 bb_signal_sample
+.temp 2 iqprod
+.temp 1 real
+.temp 1 imag
+.temp 1 ac
+.temp 1 bd
+.temp 2 swapped
+
+.temp 2 real2
+.temp 2 imag2
+
+x2 mullb iqprod, input, carrier
+splitwb bd, ac, iqprod
+subb real, ac, bd
+swapw swapped, input
+x2 mullb iqprod, swapped, carrier
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw bb_signal_sample, real, imag
+
+swapw swapped, bb_signal_sample
+
+x2 mullb iqprod, bb_signal_sample, E_code
+splitwb bd, ac, iqprod
+subb real, ac, bd
+x2 mullb iqprod, swapped, E_code
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw real2, 0, real
+mergebw imag2, 0, imag
+accw E_out_real, real2
+accw E_out_imag, imag2
+
+x2 mullb iqprod, bb_signal_sample, P_code
+splitwb bd, ac, iqprod
+subb real, ac, bd
+x2 mullb iqprod, swapped, P_code
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw real2, 0, real
+mergebw imag2, 0, imag
+accw P_out_real, real2
+accw P_out_imag, imag2
+
+.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl
+.source 2 input
+.source 2 carrier
+.source 2 L_code
+.accumulator 2 L_out_real
+.accumulator 2 L_out_imag
+
+.temp 2 bb_signal_sample
+.temp 2 iqprod
+.temp 1 real
+.temp 1 imag
+.temp 1 ac
+.temp 1 bd
+.temp 2 swapped
+
+.temp 2 real2
+.temp 2 imag2
+
+x2 mullb iqprod, input, carrier
+splitwb bd, ac, iqprod
+subb real, ac, bd
+swapw swapped, input
+x2 mullb iqprod, swapped, carrier
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw bb_signal_sample, real, imag
+
+swapw swapped, bb_signal_sample
+
+x2 mullb iqprod, bb_signal_sample, L_code
+splitwb bd, ac, iqprod
+subb real, ac, bd
+x2 mullb iqprod, swapped, L_code
+splitwb bd, ac, iqprod
+addb imag, ac, bd
+mergebw real2, 0, real
+mergebw imag2, 0, imag
+accw L_out_real, real2
+accw L_out_imag, imag2
+
+
diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc
--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 2014-10-15 01:55:08.000000000 +0200
@@ -0,0 +1,39 @@
+#/*!
+# * \file volk_gnsssdr_8u_x2_multiply_8u.orc
+# * \brief ORC implementation: multiplies unsigned char values
+# * \authors <ul>
+# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# * </ul>
+# *
+# * ORC code that multiplies unsigned char values (8 bits data)
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+
+.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl
+.source 1 src1
+.source 1 src2
+.dest 1 dst
+mullb dst, src1, src2
diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch
--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 2014-10-17 04:29:54.000000000 +0200
@@ -0,0 +1,329 @@
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt
+--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 04:26:38.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:17:37.000000000 +0200
+@@ -517,7 +517,19 @@ if(MSVC)
+ endif()
+
+ #create the volk_gnsssdr runtime library
+-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
++
++#MODIFICATIONS BY GNSS-SDR
++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)
++
++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})
++
++source_group("Kernels" FILES ${h_files})
++source_group("Common Macros" FILES ${CommonMacros})
++source_group("ORC Files" FILES ${orc})
++#END OF MODIFICATIONS
++
+ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
+ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
+ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc
+--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200
+@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn
+ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+ }
+
++//ADDED BY GNSS-SDR. START
++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++//ADDED BY GNSS-SDR. END
++
+ // This function is a nop that helps resolve GNU Radio bugs 582 and 583.
+ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol)
+ // won't happen on armhf (reported on cortex A9 and A15).
+@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
+ } else {
+ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+- } else throw "unsupported 1 arg function >1 scalars";
++ }
++ //ADDED BY GNSS-SDR. START
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ //ADDED BY GNSS-SDR. END
++ else throw "unsupported 1 arg function >1 scalars";
+ break;
+ case 2:
+ if(inputsc.size() == 0) {
+@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
+ } else {
+ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+- } else throw "unsupported 2 arg function >1 scalars";
++ }
++ //ADDED BY GNSS-SDR. START
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ //ADDED BY GNSS-SDR. END
++ else throw "unsupported 2 arg function >1 scalars";
+ break;
+ case 3:
+ if(inputsc.size() == 0) {
+@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
+ } else {
+ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+- } else throw "unsupported 3 arg function >1 scalars";
++ }
++ //ADDED BY GNSS-SDR. START
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ //ADDED BY GNSS-SDR. END
++ else throw "unsupported 3 arg function >1 scalars";
+ break;
+ case 4:
+ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ break;
++ //ADDED BY GNSS-SDR. START
++ case 8:
++ if(inputsc.size() == 0) {
++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else throw "unsupported 8 arg function >1 scalars";
++ break;
++ case 12:
++ if(inputsc.size() == 0) {
++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else throw "unsupported 12 arg function >1 scalars";
++ break;
++ //ADDED BY GNSS-SDR. END
+ default:
+ throw "no function handler for this signature";
+ break;
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h
+--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200
+@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f
+ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
+ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+
++//ADDED BY GNSS-SDR. START
++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); //two vector buffers, one scalar char input
++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); //three vector buffers, one scalar char input
++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t input
++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); //two vector buffers, one scalar lv_8sc_t input
++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); //three vector buffers, one scalar lv_8sc_t input
++
++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); //eight vector buffers, no scalar input
++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); //eight vector buffers, one scalar float input
++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); //eight vector buffers, one scalar lv_32fc_t input
++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); //eight vector buffers, one scalar char input
++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); //eight vector buffers, one scalar lv_8sc_t input
++
++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); //twelve vector buffers, no scalar input
++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); //twelve vector buffers, one scalar float input
++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); //twelve vector buffers, one scalar lv_32fc_t input
++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); //twelve vector buffers, one scalar char input
++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); //twelve vector buffers, one scalar lv_8sc_t input
++//ADDED BY GNSS-SDR. END
++
++
+ #endif //VOLK_QA_UTILS_H
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_RUNTIME
+-#define INCLUDED_VOLK_RUNTIME
++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME
++#define INCLUDED_VOLK_GNSSSDR_RUNTIME
+
+ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
+ #include <volk_gnsssdr/volk_gnsssdr_config_fixed.h>
+@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t
+
+ __VOLK_DECL_END
+
+-#endif /*INCLUDED_VOLK_RUNTIME*/
++#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200
+@@ -19,11 +19,11 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
+-#define INCLUDED_VOLK_CONFIG_FIXED_H
++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
+
+ #for $i, $arch in enumerate($archs)
+ #define LV_$(arch.name.upper()) $i
+ #end for
+
+-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/
++#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_CPU_H
+-#define INCLUDED_VOLK_CPU_H
++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H
++#define INCLUDED_VOLK_GNSSSDR_CPU_H
+
+ #include <volk_gnsssdr/volk_gnsssdr_common.h>
+
+@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch ();
+
+ __VOLK_DECL_END
+
+-#endif /*INCLUDED_VOLK_CPU_H*/
++#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_LIBVOLK_MACHINES_H
+-#define INCLUDED_LIBVOLK_MACHINES_H
++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+
+ #include <volk_gnsssdr/volk_gnsssdr_common.h>
+ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
+@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_
+
+ __VOLK_DECL_END
+
+-#endif //INCLUDED_LIBVOLK_MACHINES_H
++#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_TYPEDEFS
+-#define INCLUDED_VOLK_TYPEDEFS
++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS
++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS
+
+ #include <inttypes.h>
+ #include <volk_gnsssdr/volk_gnsssdr_complex.h>
+@@ -29,4 +29,4 @@
+ typedef void (*$(kern.pname))($kern.arglist_types);
+ #end for
+
+-#endif /*INCLUDED_VOLK_TYPEDEFS*/
++#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch
--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 2014-10-17 04:27:54.000000000 +0200
@@ -0,0 +1,38251 @@
+Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc
+--- /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc 2014-10-17 01:45:18.000000000 +0200
+@@ -37,49 +37,6 @@
+
+ namespace fs = boost::filesystem;
+
+-void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_t> results) {
+- json_file << "{" << std::endl;
+- json_file << " \"volk_gnsssdr_tests\": [" << std::endl;
+- size_t len = results.size();
+- size_t i = 0;
+- BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) {
+- json_file << " {" << std::endl;
+- json_file << " \"name\": \"" << result.name << "\"," << std::endl;
+- json_file << " \"vlen\": " << result.vlen << "," << std::endl;
+- json_file << " \"iter\": " << result.iter << "," << std::endl;
+- json_file << " \"best_arch_a\": \"" << result.best_arch_a
+- << "\"," << std::endl;
+- json_file << " \"best_arch_u\": \"" << result.best_arch_u
+- << "\"," << std::endl;
+- json_file << " \"results\": {" << std::endl;
+- size_t results_len = result.results.size();
+- size_t ri = 0;
+- typedef std::pair<std::string, volk_gnsssdr_test_time_t> tpair;
+- BOOST_FOREACH(tpair pair, result.results) {
+- volk_gnsssdr_test_time_t time = pair.second;
+- json_file << " \"" << time.name << "\": {" << std::endl;
+- json_file << " \"name\": \"" << time.name << "\"," << std::endl;
+- json_file << " \"time\": " << time.time << "," << std::endl;
+- json_file << " \"units\": \"" << time.units << "\"" << std::endl;
+- json_file << " }" ;
+- if(ri+1 != results_len) {
+- json_file << ",";
+- }
+- json_file << std::endl;
+- ri++;
+- }
+- json_file << " }" << std::endl;
+- json_file << " }";
+- if(i+1 != len) {
+- json_file << ",";
+- }
+- json_file << std::endl;
+- i++;
+- }
+- json_file << " ]" << std::endl;
+- json_file << "}" << std::endl;
+-}
+-
+ int main(int argc, char *argv[]) {
+ // Adding program options
+ boost::program_options::options_description desc("Options");
+@@ -92,9 +49,6 @@ int main(int argc, char *argv[]) {
+ ("tests-regex,R",
+ boost::program_options::value<std::string>(),
+ "Run tests matching regular expression.")
+- ("json,j",
+- boost::program_options::value<std::string>(),
+- "JSON output file")
+ ;
+
+ // Handle the options that were given
+@@ -102,8 +56,6 @@ int main(int argc, char *argv[]) {
+ bool benchmark_mode;
+ std::string kernel_regex;
+ bool store_results = true;
+- std::ofstream json_file;
+-
+ try {
+ boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
+ boost::program_options::notify(vm);
+@@ -131,14 +83,9 @@ int main(int argc, char *argv[]) {
+ return 0;
+ }
+
+- if ( vm.count("json") )
+- {
+- json_file.open( vm["json"].as<std::string>().c_str() );
+- }
+-
+
+ // Run tests
+- std::vector<volk_gnsssdr_test_results_t> results;
++ std::vector<std::string> results;
+
+ //VOLK_PROFILE(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ //VOLK_PROFILE(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+@@ -155,6 +102,55 @@ int main(int argc, char *argv[]) {
+
+ // Until we can update the config on a kernel by kernel basis
+ // do not overwrite volk_gnsssdr_config when using a regex.
++
++ //GNSS-SDR PROTO-KERNELS
++ //lv_32fc_t sfv = lv_cmake((float)1, (float)2);
++ //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
++
++ //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT:
++ //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex);
++ //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex);
++
++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++
++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++
++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++
++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
++
++ /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
++ VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/
++
+ if(store_results) {
+ char path[1024];
+ volk_gnsssdr_get_config_path(path);
+@@ -178,10 +174,8 @@ int main(int argc, char *argv[]) {
+ #the function name is followed by the preferred architecture.\n\
+ ";
+
+- BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) {
+- config << result.config_name << " "
+- << result.best_arch_a << " "
+- << result.best_arch_u << std::endl;
++ BOOST_FOREACH(std::string result, results) {
++ config << result << std::endl;
+ }
+ config.close();
+ }
+Binary files /Users/andres/Desktop/volk_gnsssdr/kernels/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/kernels/.DS_Store differ
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,174 @@
++/*!
++ * \file CommonMacros.h
++ * \brief Common macros used inside the volk protokernels.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++#ifndef INCLUDED_gnsssdr_CommonMacros_u_H
++#define INCLUDED_gnsssdr_CommonMacros_u_H
++
++ #ifdef LV_HAVE_SSE4_1
++ /*!
++ \brief Macros for U_SSE4_1
++ */
++
++ #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1
++ #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\
++ imag = _mm_srli_si128 (input1, 2);\
++ imag = _mm_blend_epi16 (input2, imag, 85);\
++ real = _mm_slli_si128 (input2, 2);\
++ real = _mm_blend_epi16 (real, input1, 85);
++ #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */
++
++ #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1
++ #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\
++ input_i_1 = _mm_cvtepi16_epi32(input);\
++ input = _mm_srli_si128 (input, 8);\
++ input_i_2 = _mm_cvtepi16_epi32(input);\
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\
++ output_ps = _mm_cvtepi32_ps(output_i32);
++ #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
++
++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1
++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
++ input_i_1 = _mm_cvtepi8_epi32(input);\
++ input = _mm_srli_si128 (input, 4);\
++ input_i_2 = _mm_cvtepi8_epi32(input);\
++ input = _mm_srli_si128 (input, 4);\
++ output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\
++ input_i_1 = _mm_cvtepi8_epi32(input);\
++ input = _mm_srli_si128 (input, 4);\
++ input_i_2 = _mm_cvtepi8_epi32(input);\
++ input = _mm_srli_si128 (input, 4);\
++ output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\
++ output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\
++ output_ps = _mm_cvtepi32_ps(output_i32);
++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
++
++ #endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_SSE2
++ /*!
++ \brief Macros for U_SSE2
++ */
++
++ #ifdef LV_HAVE_SSSE3
++ /*!
++ \brief Macros for U_SSSE3
++ */
++
++ #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3
++ #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\
++ y_aux = _mm_sign_epi8 (y, x);\
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\
++ real_output = _mm_maddubs_epi16 (x_abs, y_aux);\
++ \
++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\
++ y_aux = _mm_sign_epi8 (y_aux, x);\
++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux);
++ #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */
++
++ #endif /* LV_HAVE_SSSE3 */
++
++ #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 /* Lane-wise complex multiply (realx + j*imagx) * (realy + j*imagy) on 16-bit lanes. Each product keeps only its low 16 bits and the final add/sub wrap around (no saturation). */
++ #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);\
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* Re = xr*yr - xi*yi */\
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */
++
++ #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 /* real = input & mult1; imag = (input shifted right by one byte) & mult1. Assuming mult1 keeps only the low byte of every 16-bit lane (TODO confirm at call sites), `real` receives the even bytes and `imag` the odd bytes, both zero-extended (not sign-extended) to 16 bits. */
++ #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\
++ imag = _mm_srli_si128 (input, 1); /* move every odd byte down onto the preceding even position */\
++ imag = _mm_and_si128 (imag, mult1);\
++ real = _mm_and_si128 (input, mult1);
++ #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */
++
++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 /* SSE2-only widening of the 16 signed bytes of `input` to int32: output_ps_1 lane k = b[k] + b[k+4], output_ps_2 lane k = b[k+8] + b[k+12], both converted to float. `input` itself is not modified. */
++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
++ input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input); /* bytes 0..7 placed in the HIGH byte of each 16-bit lane */\
++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1); /* bytes 0..3 now sit in the top byte of each 32-bit lane */\
++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1); /* same for bytes 4..7 */\
++ input_i_1 = _mm_srai_epi32(input_i_1, 24); /* arithmetic shift brings the byte down and sign-extends it */\
++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\
++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);\
++ \
++ input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input); /* repeat the same steps for bytes 8..15 */\
++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\
++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\
++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */
++
++ #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2 /* Adds 1 to every byte of y that equals the corresponding byte of minus128 (presumably a vector of -128, turning -128 into -127 - TODO confirm at call sites). y is modified in place. */
++ #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
++ minus128control = _mm_cmpeq_epi8 (y, minus128); /* 0xFF (i.e. -1) in matching bytes, 0 elsewhere */\
++ y = _mm_sub_epi8 (y, minus128control);
++ #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */
++
++ #endif /* LV_HAVE_SSE2 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Macros for U_GENERIC
++ */
++
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_CommonMacros_a_H
++#define INCLUDED_gnsssdr_CommonMacros_a_H
++
++ #ifdef LV_HAVE_SSE4_1
++ /*!
++ \brief Macros for A_SSE4_1
++ */
++
++ #endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_SSE2
++ /*!
++ \brief Macros for A_SSE2
++ */
++
++ #endif /* LV_HAVE_SSE2 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Macros for A_GENERIC
++ */
++
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,76 @@
++/*!
++ * \file CommonMacros_16ic_cw_epl_corr_32fc.h
++ * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
++#include "CommonMacros/CommonMacros.h"
++
++ #ifdef LV_HAVE_SSE4_1
++ /*!
++ \brief Macros for U_SSE4_1
++ */
++
++ #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 /* One correlation step for 16-bit complex samples: splits the code vectors y1/y2 into realy/imagy, complex-multiplies them by the base-band sample, and converts the 16-bit results to float in real_output_ps / imag_output_ps. The REARRANGE and CONVERT_AND_ACC helpers are expected to come from CommonMacros/CommonMacros.h. */
++ #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) /* input_i_1, input_i_2 and output_i32 are scratch and are reused by the next call */\
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
++
++ #endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Macros for U_GENERIC
++ */
++
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
++
++ #ifdef LV_HAVE_SSE4_1
++ /*!
++ \brief Macros for A_SSE4_1
++ */
++
++ #endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Macros for A_GENERIC
++ */
++
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,114 @@
++/*!
++ * \file CommonMacros_8ic_cw_epl_corr_32fc.h
++ * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
++#include "CommonMacros/CommonMacros.h"
++
++ #ifdef LV_HAVE_SSE4_1
++ /*!
++ \brief Macros for U_SSE4_1
++ */
++
++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 /* One correlation step for 8-bit complex samples: splits y into realy/imagy, complex-multiplies by the base-band sample, re-interleaves the products byte-wise and accumulates them as floats in output_ps. */
++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
++ /* NOTE: `output` below is NOT a macro parameter; a variable with exactly that name must exist in the scope where the macro is expanded. */\
++ imag_output = _mm_slli_si128 (imag_output, 1); /* move the imaginary results one byte up */\
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); /* take real_output where the top bit of the mult1 byte is set, imag_output elsewhere */\
++ \
++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
++
++ #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 /* Same steps as CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1, preceded by CM_8IC_CONTROLMINUS128_8IC_U_SSE2, which rewrites in place the bytes of y that equal minus128. */
++ #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
++ CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control) /* y is modified here */\
++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output) /* bb_signal_sample_aux_abs is read only: set it before expanding this macro */\
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
++ #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */
++
++ #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 /* 8-bit correlation step without the minus128 pre-check performed by the SAFE variant: byte products of y and bb_signal_sample_aux, then conversion of the 16-bit results to float. */
++ #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output) /* bb_signal_sample_aux_abs is read only: set it before expanding this macro */\
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
++ #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */
++
++ #endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_SSE2
++ /*!
++ \brief Macros for U_SSE2
++ */
++
++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 /* SSE2 correlation step for 8-bit complex samples: same flow as the SSE4.1 variant, but the real/imag products are re-interleaved with and/shift/or and the float results are split across output_ps_1 and output_ps_2. */
++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
++ /* NOTE: `output` below is NOT a macro parameter; a variable with exactly that name must exist in the scope where the macro is expanded. */\
++ real_output = _mm_and_si128 (real_output, mult1); /* keep only the bytes selected by mult1 */\
++ imag_output = _mm_and_si128 (imag_output, mult1);\
++ imag_output = _mm_slli_si128 (imag_output, 1); /* move the imaginary bytes one position up */\
++ output = _mm_or_si128 (real_output, imag_output); /* merge both masked vectors into one interleaved byte vector */\
++ \
++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */
++
++ #endif /* LV_HAVE_SSE2 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Macros for U_GENERIC
++ */
++
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
++
++ #ifdef LV_HAVE_SSE4_1
++ /*!
++ \brief Macros for A_SSE4_1
++ */
++
++ #endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Macros for A_GENERIC
++ */
++
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,34 @@
++####################################################################
++Common Macros inside volk_gnsssdr module
++####################################################################
++
++First of all, sorry that you need to read this: macros are evil, they cannot be debugged, you do not know where the errors come from, and the syntax is annoying... BUT this is the only way I found to share one piece of code between several proto-kernels without a performance penalty.
++Inline functions have been tested; they introduce a really small time penalty per call, but it becomes huge in long loops with thousands of samples.
++
++####################################################################
++Syntax
++####################################################################
++
++To make the code easier to understand, I created the macros following a specific naming syntax.
++
++1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example:
++
++example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
++
++First of all, you find the characters “CM”, which stand for CommonMacros. After that come the type and the number of inputs: “_16IC_X4” (16-bit complex integers, four inputs). The syntax for the type is the same as the one used for volk protokernels; refer to the GNU Radio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run it (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output).
++The variables that the macro needs are passed explicitly when calling it in order to avoid problems that would only show up after compiling: if you want to use a macro you must declare all the variables it needs beforehand, or the code will not compile.
++
++2) Inside all the other headers, CommonMacros_XXXXXX.h, you will find macros for a specific group of proto-kernels. The syntax is the same as in CommonMacros.h.
++
++####################################################################
++Workflow
++####################################################################
++
++In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints.
++When it works, I place the code inside a macro and I test it again.
++
++####################################################################
++Why macros
++####################################################################
++1) They are the only way I could find for sharing code between proto-kernels without performance penalty.
++2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks the results of all the SSE proto-kernel implementations against the results of the generic implementation, so if your macro is not working you will notice it when profiling.
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,241 @@
++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include <smmintrin.h>
++
++ /*!
++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (SSE4.1 version, 8 points per iteration)
++ \param inputVector The 16 bit input data buffer
++ \param outputVector The floating point output data buffer
++ \param scalar The divisor applied to every converted value
++ \param num_points The number of data values to be converted
++ \note Neither buffer needs to be aligned (unaligned load/store intrinsics are used). Full groups of 8 points are multiplied by 1/scalar, while the remaining num_points % 8 points are divided by scalar directly.
++ */
++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal broadcast to all four lanes, so the loop multiplies instead of dividing
++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through this pointer
++ __m128i inputVal;
++ __m128i inputVal2;
++ __m128 ret;
++
++ for(;number < eighthPoints; number++){
++
++ // Load the 8 values (unaligned load)
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++
++ // Shift the input data to the right by 64 bits ( 8 bytes ), so values 4..7 end up in the lower half
++ inputVal2 = _mm_srli_si128(inputVal, 8);
++
++ // Sign-extend the lower 4 int16 values of each register into 32 bit words
++ inputVal = _mm_cvtepi16_epi32(inputVal);
++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
++
++ ret = _mm_cvtepi32_ps(inputVal); // values 0..3 as float
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ ret = _mm_cvtepi32_ps(inputVal2); // values 4..7 as float
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++
++ outputVectorPtr += 4;
++
++ inputPtr += 8;
++ }
++
++ number = eighthPoints * 8; // first index not handled by the vector loop
++ for(; number < num_points; number++){
++ outputVector[number] =((float)(inputVector[number])) / scalar;
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++
++ /*!
++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (SSE version, 4 points per iteration)
++ \param inputVector The 16 bit input data buffer
++ \param outputVector The floating point output data buffer
++ \param scalar The divisor applied to every converted value
++ \param num_points The number of data values to be converted
++ \note Output buffer does NOT need to be properly aligned (unaligned stores are used). Full groups of 4 points are multiplied by 1/scalar, while the remaining num_points % 4 points are divided by scalar directly.
++ */
++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal broadcast to all four lanes
++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through this pointer
++ __m128 ret;
++
++ for(;number < quarterPoints; number++){
++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); // _mm_set_ps takes the highest lane first, so lane 0 holds inputPtr[0]
++
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++
++ inputPtr += 4;
++ outputVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4; // first index not handled by the vector loop
++ for(; number < num_points; number++){
++ outputVector[number] = (float)(inputVector[number]) / scalar;
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (portable scalar version)
++ \param inputVector The 16 bit input data buffer
++ \param outputVector The floating point output data buffer
++ \param scalar The divisor applied to every converted value
++ \param num_points The number of data values to be converted
++ \note Output buffer does NOT need to be properly aligned
++ */
++static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
++ float* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; // one true division per point (no reciprocal multiply)
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */
++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include <smmintrin.h>
++
++ /*!
++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (SSE4.1 version, 8 points per iteration)
++ \param inputVector The 16 bit input data buffer
++ \param outputVector The floating point output data buffer
++ \param scalar The divisor applied to every converted value
++ \param num_points The number of data values to be converted
++ \note The _a_ suffix presumably marks the aligned variant (VOLK naming), but this body uses the unaligned intrinsics _mm_loadu_si128 / _mm_storeu_ps. Full groups of 8 points are multiplied by 1/scalar, the remaining num_points % 8 points are divided by scalar directly. */
++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int eighthPoints = num_points / 8;
++
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal broadcast to all four lanes, so the loop multiplies instead of dividing
++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through this pointer
++ __m128i inputVal;
++ __m128i inputVal2;
++ __m128 ret;
++
++ for(;number < eighthPoints; number++){
++
++ // Load the 8 values (unaligned load)
++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
++
++ // Shift the input data to the right by 64 bits ( 8 bytes ), so values 4..7 end up in the lower half
++ inputVal2 = _mm_srli_si128(inputVal, 8);
++
++ // Sign-extend the lower 4 int16 values of each register into 32 bit words
++ inputVal = _mm_cvtepi16_epi32(inputVal);
++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
++
++ ret = _mm_cvtepi32_ps(inputVal); // values 0..3 as float
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++ outputVectorPtr += 4;
++
++ ret = _mm_cvtepi32_ps(inputVal2); // values 4..7 as float
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++
++ outputVectorPtr += 4;
++
++ inputPtr += 8;
++ }
++
++ number = eighthPoints * 8; // first index not handled by the vector loop
++ for(; number < num_points; number++){
++ outputVector[number] =((float)(inputVector[number])) / scalar;
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++
++ /*!
++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (SSE version, 4 points per iteration)
++ \param inputVector The 16 bit input data buffer
++ \param outputVector The floating point output data buffer
++ \param scalar The divisor applied to every converted value
++ \param num_points The number of data values to be converted
++ \note The _a_ suffix presumably marks the aligned variant (VOLK naming), but this body stores with the unaligned intrinsic _mm_storeu_ps. Full groups of 4 points are multiplied by 1/scalar, the remaining num_points % 4 points are divided by scalar directly. */
++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* outputVectorPtr = outputVector;
++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal broadcast to all four lanes
++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through this pointer
++ __m128 ret;
++
++ for(;number < quarterPoints; number++){
++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); // _mm_set_ps takes the highest lane first, so lane 0 holds inputPtr[0]
++
++ ret = _mm_mul_ps(ret, invScalar);
++ _mm_storeu_ps(outputVectorPtr, ret);
++
++ inputPtr += 4;
++ outputVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4; // first index not handled by the vector loop
++ for(; number < num_points; number++){
++ outputVector[number] = (float)(inputVector[number]) / scalar;
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (portable scalar version)
++ \param inputVector The 16 bit input data buffer
++ \param outputVector The floating point output data buffer
++ \param scalar The divisor applied to every converted value
++ \param num_points The number of data values to be converted
++ */
++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
++ float* outputVectorPtr = outputVector;
++ const int16_t* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; // one true division per point (no reciprocal multiply)
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,461 @@
++/*!
++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
++ * real part and 16 bits the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 32 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 32 bits vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 32 bits vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 32 bits vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++ /*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 real_output_ps, imag_output_ps;
++
++ float E_out_real = 0;
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++ input_ptr += 4;
++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ //Adds the float 32 results
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values
++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values
++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ input_ptr += 4;
++ carrier_ptr += 4;
++ E_code_ptr += 4;
++ P_code_ptr += 4;
++ L_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i)
++ {
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ }
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable (non-SIMD) carrier wipe-off mixing followed by the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input, multiplied sample by sample with the input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single accumulated value)
++ \param P_out Prompt correlation output (a single accumulated value)
++ \param L_out Late correlation output (a single accumulated value)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    lv_16sc_t bb_signal_sample;
++    lv_16sc_t tmp1;
++    lv_16sc_t tmp2;
++    lv_16sc_t tmp3;
++
++    bb_signal_sample = lv_cmake(0, 0);
++
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    // Accumulate the Early, Prompt and Late correlations over all samples
++
++    for(unsigned int i=0; i < num_points; ++i) // unsigned counter: same type as num_points, so no sign-compare and no signed overflow
++        {
++            // Perform the carrier wipe-off (product kept in lv_16sc_t)
++            bb_signal_sample = input[i] * carrier[i];
++
++            tmp1 = bb_signal_sample * E_code[i];
++            tmp2 = bb_signal_sample * P_code[i];
++            tmp3 = bb_signal_sample * L_code[i];
++
++            // Convert each lv_16sc_t product to lv_32fc_t and add it to the matching output
++            *E_out += (lv_32fc_t)tmp1;
++            *P_out += (lv_32fc_t)tmp2;
++            *L_out += (lv_32fc_t)tmp3;
++        }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1, aligned loads, 8 complex samples per loop iteration)
++ \param input The input signal input (read with _mm_load_si128, which requires a 16-byte aligned address)
++ \param carrier The carrier signal input (read with _mm_load_si128)
++ \param E_code Early PRN code replica input (read with _mm_load_si128)
++ \param P_code Prompt PRN code replica input (read with _mm_load_si128)
++ \param L_code Late PRN code replica input (read with _mm_load_si128)
++ \param E_out Early correlation output (a single accumulated value)
++ \param P_out Prompt correlation output (a single accumulated value)
++ \param L_out Late correlation output (a single accumulated value)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // every SSE iteration advances each input pointer by 8 samples
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 real_output_ps, imag_output_ps;
++
++ float E_out_real = 0;
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // NOTE(review): mult1 is not read anywhere in this body; check whether the CM_* macros use it, otherwise it is dead
++
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off: two aligned 128-bit loads each for input (x1, x2) and carrier (y1, y2)
++ x1 = _mm_load_si128((__m128i*)input_ptr);
++ input_ptr += 4;
++ x2 = _mm_load_si128((__m128i*)input_ptr);
++
++ y1 = _mm_load_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)carrier_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) // NOTE(review): macro defined in CommonMacros (not visible here); presumably splits x1/x2 into realx/imagx - confirm
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values (y1/y2 are reused for the code replica)
++ y1 = _mm_load_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ //Add the float results (real_output_ps / imag_output_ps) to the early accumulators
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values
++ y1 = _mm_load_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values
++ y1 = _mm_load_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ input_ptr += 4;
++ carrier_ptr += 4;
++ E_code_ptr += 4;
++ P_code_ptr += 4;
++ L_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of each accumulator
++ {
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ }
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off for the num_points%8 samples left after the SSE loop
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Accumulate the early, prompt and late products directly into the outputs
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable (non-SIMD) carrier wipe-off mixing followed by the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input, multiplied sample by sample with the input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single accumulated value)
++ \param P_out Prompt correlation output (a single accumulated value)
++ \param L_out Late correlation output (a single accumulated value)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    lv_16sc_t bb_signal_sample;
++    lv_16sc_t tmp1;
++    lv_16sc_t tmp2;
++    lv_16sc_t tmp3;
++
++    bb_signal_sample = lv_cmake(0, 0);
++
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    // Accumulate the Early, Prompt and Late correlations over all samples
++
++    for(unsigned int i=0; i < num_points; ++i) // unsigned counter: same type as num_points, so no sign-compare and no signed overflow
++        {
++            // Perform the carrier wipe-off (product kept in lv_16sc_t)
++            bb_signal_sample = input[i] * carrier[i];
++
++            tmp1 = bb_signal_sample * E_code[i];
++            tmp2 = bb_signal_sample * P_code[i];
++            tmp3 = bb_signal_sample * L_code[i];
++
++            // Convert each lv_16sc_t product to lv_32fc_t and add it to the matching output
++            *E_out += (lv_32fc_t)tmp1;
++            *P_out += (lv_32fc_t)tmp2;
++            *L_out += (lv_32fc_t)tmp3;
++        }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,1568 @@
++/*!
++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
++ * real part and 16 bits the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 32 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 32 bits vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 32 bits vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 32 bits vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++ /*!
++  \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (test method "first": interleaved real/imag lanes, unaligned loads, 4 complex samples per SSE iteration)
++  \param input The input signal input
++  \param carrier The carrier signal input
++  \param E_code Early PRN code replica input
++  \param P_code Prompt PRN code replica input
++  \param L_code Late PRN code replica input
++  \param E_out Early correlation output (a single accumulated value)
++  \param P_out Prompt correlation output (a single accumulated value)
++  \param L_out Late correlation output (a single accumulated value)
++  \param num_points The number of complex values in vectors
++  */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 4;
++
++    __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled;
++
++    __m128 z_ps_1, z_ps_2, z_E, z_P, z_L;
++    __m128i z_i_1, z_i_2;
++
++    lv_32fc_t dotProduct_E = lv_cmake(0.0f, 0.0f); // must start at zero: with num_points < 4 the SSE branch is skipped and the tail loop accumulates into these
++    lv_32fc_t dotProduct_P = lv_cmake(0.0f, 0.0f);
++    lv_32fc_t dotProduct_L = lv_cmake(0.0f, 0.0f);
++
++    z_E = _mm_setzero_ps();
++    z_P = _mm_setzero_ps();
++    z_L = _mm_setzero_ps();
++
++    const lv_16sc_t* _input = input;
++    const lv_16sc_t* _carrier = carrier;
++    const lv_16sc_t* _E_code = E_code;
++    const lv_16sc_t* _P_code = P_code;
++    const lv_16sc_t* _L_code = L_code;
++
++    if (sse_iters>0)
++        {
++            for(unsigned int number = 0;number < sse_iters; number++)
++                {
++                    //Perform the carrier wipe-off
++                    x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
++                    y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++                    // Load yl with cr,cr,dr,dr
++                    // Load yh with ci,ci,di,di
++                    yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); // real parts to the low half, imaginary parts to the high half
++                    yl = _mm_unpacklo_epi16(yaux, yaux);
++                    yh = _mm_unpackhi_epi16(yaux, yaux);
++
++                    tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++                    x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br
++
++                    tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++                    tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); // negate the even lanes (the ai*ci terms) so the add below forms the real part
++                    bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++                    bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br
++
++                    // correlation E,P,L (3x vector scalar product)
++                    // Early
++                    y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++                    yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
++                    yl = _mm_unpacklo_epi16(yaux, yaux);
++                    yh = _mm_unpackhi_epi16(yaux, yaux);
++
++                    tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++                    tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++                    tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
++                    z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++                    z_i_1 = _mm_cvtepi16_epi32(z); // widen the low four 16-bit lanes to 32 bits, then to float
++                    z_ps_1 = _mm_cvtepi32_ps(z_i_1);
++                    z = _mm_srli_si128 (z, 8); // bring the high four 16-bit lanes down
++                    z_i_2 = _mm_cvtepi16_epi32(z);
++                    z_ps_2 = _mm_cvtepi32_ps(z_i_2);
++
++                    z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together
++                    z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together
++
++                    // Prompt
++                    y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++                    yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
++                    yl = _mm_unpacklo_epi16(yaux, yaux);
++                    yh = _mm_unpackhi_epi16(yaux, yaux);
++
++                    tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++                    tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++                    tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
++                    z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++                    z_i_1 = _mm_cvtepi16_epi32(z);
++                    z_ps_1 = _mm_cvtepi32_ps(z_i_1);
++                    z = _mm_srli_si128 (z, 8);
++                    z_i_2 = _mm_cvtepi16_epi32(z);
++                    z_ps_2 = _mm_cvtepi32_ps(z_i_2);
++
++                    z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together
++                    z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together
++
++                    // Late
++                    y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++                    yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
++                    yl = _mm_unpacklo_epi16(yaux, yaux);
++                    yh = _mm_unpackhi_epi16(yaux, yaux);
++
++                    tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++                    tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++                    tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
++                    z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++                    z_i_1 = _mm_cvtepi16_epi32(z);
++                    z_ps_1 = _mm_cvtepi32_ps(z_i_1);
++                    z = _mm_srli_si128 (z, 8);
++                    z_i_2 = _mm_cvtepi16_epi32(z);
++                    z_ps_2 = _mm_cvtepi32_ps(z_i_2);
++
++                    z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together
++                    z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together
++
++                    _input += 4;
++                    _carrier += 4;
++                    _E_code += 4;
++                    _L_code += 4;
++                    _P_code += 4;
++                }
++
++            __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++            __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++            __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++
++            _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++
++            dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
++            dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
++            dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
++        }
++
++    for(unsigned int i=0; i < num_points%4; ++i) // samples left over after the SSE loop
++        {
++            dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier));
++            dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier));
++            dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++));
++        }
++
++    *E_out = dotProduct_E;
++    *P_out = dotProduct_P;
++    *L_out = dotProduct_L;
++
++
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (test method "second": separate real/imag vectors, unaligned loads, 8 complex samples per SSE iteration)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single accumulated value)
++ \param P_out Prompt correlation output (a single accumulated value)
++ \param L_out Late correlation output (a single accumulated value)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8;
++
++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++    __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++    __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
++    __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
++
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++
++    const lv_16sc_t* input_ptr = input;
++    const lv_16sc_t* carrier_ptr = carrier;
++
++    const lv_16sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_16sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_16sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++
++    *E_out_ptr = 0;
++    *P_out_ptr = 0;
++    *L_out_ptr = 0;
++
++    // No byte-mask constant is set up here: real and imaginary parts are separated below with byte shifts plus _mm_blend_epi16.
++
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++
++    if (sse_iters>0)
++        {
++            for(unsigned int number = 0;number < sse_iters; number++){
++
++                    //Perform the carrier wipe-off
++                    x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++                    input_ptr += 4;
++                    x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++                    y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++                    carrier_ptr += 4;
++                    y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++                    imagx = _mm_srli_si128 (x1, 2); // shift x1 down one 16-bit lane so its imaginary parts sit in the even lanes
++                    imagx = _mm_blend_epi16 (x2, imagx, 85); // 85 = 0b01010101: even lanes from the second operand, odd lanes (x2 imaginary parts) from the first
++                    realx = _mm_slli_si128 (x2, 2); // shift x2 up one 16-bit lane so its real parts sit in the odd lanes
++                    realx = _mm_blend_epi16 (realx, x1, 85); // even lanes: x1 real parts, odd lanes: x2 real parts
++
++                    imagy = _mm_srli_si128 (y1, 2);
++                    imagy = _mm_blend_epi16 (y2, imagy, 85);
++                    realy = _mm_slli_si128 (y2, 2);
++                    realy = _mm_blend_epi16 (realy, y1, 85);
++
++                    realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++                    imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++                    realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++                    imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++                    real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++                    imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++                    //Get early values
++                    y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++                    E_code_ptr += 4;
++                    y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++                    imagy = _mm_srli_si128 (y1, 2);
++                    imagy = _mm_blend_epi16 (y2, imagy, 85);
++                    realy = _mm_slli_si128 (y2, 2);
++                    realy = _mm_blend_epi16 (realy, y1, 85);
++
++                    realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++                    imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++                    realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++                    imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++                    real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++                    imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++                    real_output_i_1 = _mm_cvtepi16_epi32(real_output); // widen the low four 16-bit lanes to 32 bits, then to float
++                    real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
++                    real_output = _mm_srli_si128 (real_output, 8); // bring the high four 16-bit lanes down
++                    real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++                    real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
++
++                    imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++                    imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
++                    imag_output = _mm_srli_si128 (imag_output, 8);
++                    imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++                    imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
++
++                    real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1);
++                    real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
++                    imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
++                    imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
++
++                    //Get prompt values
++                    y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++                    P_code_ptr += 4;
++                    y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++                    imagy = _mm_srli_si128 (y1, 2);
++                    imagy = _mm_blend_epi16 (y2, imagy, 85);
++                    realy = _mm_slli_si128 (y2, 2);
++                    realy = _mm_blend_epi16 (realy, y1, 85);
++
++                    realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++                    imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++                    realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++                    imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++                    real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++                    imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++                    real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++                    real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
++                    real_output = _mm_srli_si128 (real_output, 8);
++                    real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++                    real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
++
++                    imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++                    imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
++                    imag_output = _mm_srli_si128 (imag_output, 8);
++                    imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++                    imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
++
++                    real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
++                    real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
++                    imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
++                    imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
++
++                    //Get late values
++                    y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++                    L_code_ptr += 4;
++                    y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++                    imagy = _mm_srli_si128 (y1, 2);
++                    imagy = _mm_blend_epi16 (y2, imagy, 85);
++                    realy = _mm_slli_si128 (y2, 2);
++                    realy = _mm_blend_epi16 (realy, y1, 85);
++
++                    realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++                    imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++                    realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++                    imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++                    real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++                    imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++                    real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++                    real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
++                    real_output = _mm_srli_si128 (real_output, 8);
++                    real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++                    real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
++
++                    imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++                    imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
++                    imag_output = _mm_srli_si128 (imag_output, 8);
++                    imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++                    imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
++
++                    real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
++                    real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
++                    imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
++                    imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
++
++                    input_ptr += 4;
++                    carrier_ptr += 4;
++                    E_code_ptr += 4;
++                    L_code_ptr += 4;
++                    P_code_ptr += 4;
++            }
++
++            __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++            _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++            _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++            for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of each accumulator
++                {
++                    E_out_real += real_E_dotProductVector[i];
++                    E_out_imag += imag_E_dotProductVector[i];
++                    P_out_real += real_P_dotProductVector[i];
++                    P_out_imag += imag_P_dotProductVector[i];
++                    L_out_real += real_L_dotProductVector[i];
++                    L_out_imag += imag_L_dotProductVector[i];
++                }
++            *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++            *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++            *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        }
++
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i=0; i < num_points%8; ++i)
++        {
++            //Perform the carrier wipe-off for the samples left over after the SSE loop
++            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++            // Accumulate the early, prompt and late products directly into the outputs
++            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++        }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE4.1 version that addresses the buffers through a running index)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors (blocks of 8 go through the SSE loop, the remaining num_points % 8 through a scalar loop)
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-element blocks handled by the SIMD loop
++ unsigned int index = 0; // element offset shared by the SIMD loop and the scalar tail loop
++ unsigned int indexPlus4 = 0; // offset of the second 128-bit load of each block
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
++
++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; // float accumulators, one real/imag pair per correlator (E, P, L)
++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
++ __m128 real_output_ps, imag_output_ps;
++
++ float E_out_real = 0; // scalar totals, filled by the horizontal sum after the SIMD loop
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0; // outputs start at zero so the scalar tail loop can accumulate into them even when sse_iters == 0
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(index = 0;index < 8*sse_iters; index+=8){
++ indexPlus4 = index + 4;
++ //Perform the carrier wipe-off: two unaligned 128-bit loads each from input and carrier, 4 elements apart
++ x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]);
++ x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]);
++
++ y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]);
++ y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]);
++
++ imagx = _mm_srli_si128 (x1, 2); // shift x1 down by one 16-bit lane
++ imagx = _mm_blend_epi16 (x2, imagx, 85); // mask 85 (0x55): even 16-bit lanes from shifted x1, odd lanes from x2 -> presumably the imaginary parts, assuming interleaved re,im storage
++ realx = _mm_slli_si128 (x2, 2); // shift x2 up by one 16-bit lane
++ realx = _mm_blend_epi16 (realx, x1, 85); // even lanes from x1, odd lanes from shifted x2 -> presumably the real parts, in the same lane order as imagx
++
++ imagy = _mm_srli_si128 (y1, 2); // same de-interleave for the carrier
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // _mm_mullo_epi16 keeps only the low 16 bits of each product
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part of input*carrier: re*re - im*im
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
++
++ //Get early values: de-interleave E_code and complex-multiply it with the baseband sample
++ y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]);
++ y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]);
++
++ imagy = _mm_srli_si128 (y1, 2);
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); // sign-extend the low four 16-bit lanes to 32 bits
++ real_output = _mm_srli_si128 (real_output, 8); // move the high four lanes down
++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); // add the two halves in 32-bit precision
++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); // convert to float for accumulation
++
++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++ imag_output = _mm_srli_si128 (imag_output, 8);
++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values (same steps as for the early code, using P_code)
++ y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]);
++ y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]);
++
++ imagy = _mm_srli_si128 (y1, 2);
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++ real_output = _mm_srli_si128 (real_output, 8);
++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++ imag_output = _mm_srli_si128 (imag_output, 8);
++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values (same steps as for the early code, using L_code)
++ y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]);
++ y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]);
++
++ imagy = _mm_srli_si128 (y1, 2);
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++ real_output = _mm_srli_si128 (real_output, 8);
++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++ imag_output = _mm_srli_si128 (imag_output, 8);
++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i) // horizontal sum of the four lanes of each accumulator
++ {
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ }
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); // replace the zero-initialised outputs with the SIMD totals
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(; index < num_points; index++) // scalar tail: index continues from 8*sse_iters and covers the remaining elements
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input_ptr[index] * carrier_ptr[index];
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]);
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]);
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]);
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE4.1 version that walks the buffers by advancing the pointers)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors (blocks of 8 go through the SSE loop, the remaining num_points % 8 through a scalar loop)
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-element blocks handled by the SIMD loop
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
++
++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; // float accumulators, one real/imag pair per correlator (E, P, L)
++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
++ __m128 real_output_ps, imag_output_ps;
++
++ float E_out_real = 0; // scalar totals, filled by the horizontal sum after the SIMD loop
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0; // outputs start at zero so the scalar tail loop can accumulate into them even when sse_iters == 0
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off: two unaligned 128-bit loads each from input and carrier, 4 elements apart
++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++ input_ptr += 4; // step to the second half of the 8-element block
++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ imagx = _mm_srli_si128 (x1, 2); // shift x1 down by one 16-bit lane
++ imagx = _mm_blend_epi16 (x2, imagx, 85); // mask 85 (0x55): even 16-bit lanes from shifted x1, odd lanes from x2 -> presumably the imaginary parts, assuming interleaved re,im storage
++ realx = _mm_slli_si128 (x2, 2); // shift x2 up by one 16-bit lane
++ realx = _mm_blend_epi16 (realx, x1, 85); // even lanes from x1, odd lanes from shifted x2 -> presumably the real parts, in the same lane order as imagx
++
++ imagy = _mm_srli_si128 (y1, 2); // same de-interleave for the carrier
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // _mm_mullo_epi16 keeps only the low 16 bits of each product
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part of input*carrier: re*re - im*im
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
++
++ //Get early values: de-interleave E_code and complex-multiply it with the baseband sample
++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ imagy = _mm_srli_si128 (y1, 2);
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); // sign-extend the low four 16-bit lanes to 32 bits
++ real_output = _mm_srli_si128 (real_output, 8); // move the high four lanes down
++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); // add the two halves in 32-bit precision
++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); // convert to float for accumulation
++
++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++ imag_output = _mm_srli_si128 (imag_output, 8);
++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values (same steps as for the early code, using P_code)
++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ imagy = _mm_srli_si128 (y1, 2);
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++ real_output = _mm_srli_si128 (real_output, 8);
++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++ imag_output = _mm_srli_si128 (imag_output, 8);
++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values (same steps as for the early code, using L_code)
++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ imagy = _mm_srli_si128 (y1, 2);
++ imagy = _mm_blend_epi16 (y2, imagy, 85);
++ realy = _mm_slli_si128 (y2, 2);
++ realy = _mm_blend_epi16 (realy, y1, 85);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++ real_output = _mm_srli_si128 (real_output, 8);
++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++ imag_output = _mm_srli_si128 (imag_output, 8);
++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ input_ptr += 4; // second advance: with the += 4 between the two loads, each pointer moves 8 elements per iteration
++ carrier_ptr += 4;
++ E_code_ptr += 4;
++ L_code_ptr += 4;
++ P_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i) // horizontal sum of the four lanes of each accumulator
++ {
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ }
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); // replace the zero-initialised outputs with the SIMD totals
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail: the pointers already sit past the elements consumed by the SIMD loop
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE4.1 version built on the CM_* helper macros)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors (blocks of 8 go through the SSE loop, the remaining num_points % 8 through a scalar loop)
++ */
++
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-element blocks handled by the SIMD loop
++
++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; // scratch registers handed to the CM_* macros
++ __m128i input_i_1, input_i_2, output_i32; // scratch registers handed to CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i realx, imagx, realy, imagy, real_output, imag_output;
++
++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; // float accumulators, one real/imag pair per correlator (E, P, L)
++ __m128 real_output_ps, imag_output_ps;
++
++ float E_out_real = 0; // scalar totals, filled by the horizontal sum after the SIMD loop
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0; // outputs start at zero so the scalar tail loop can accumulate into them even when sse_iters == 0
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off (the CM_* macros are defined outside this block; presumably they split re/im lanes and form the 16-bit complex product - TODO confirm against CommonMacros.h)
++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++ input_ptr += 4; // step to the second half of the 8-element block
++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
++
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) // presumably widens the 16-bit lanes and leaves a float vector in real_output_ps - TODO confirm
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values (same macro sequence as for the early code, using P_code)
++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
++
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values (same macro sequence as for the early code, using L_code)
++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
++
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ input_ptr += 4; // second advance: with the += 4 between the two loads, each pointer moves 8 elements per iteration
++ carrier_ptr += 4;
++ E_code_ptr += 4;
++ L_code_ptr += 4;
++ P_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i) // horizontal sum of the four lanes of each accumulator
++ {
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ }
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); // replace the zero-initialised outputs with the SIMD totals
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail: the pointers already sit past the elements consumed by the SIMD loop
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy;
++ __m128i input_i_1, input_i_2, output_i32;
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i realx, imagx, realy, imagy, real_output, imag_output;
++
++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++ __m128 real_output_ps, imag_output_ps;
++
++ float E_out_real = 0;
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++ input_ptr += 4;
++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values
++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values
++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ input_ptr += 4;
++ carrier_ptr += 4;
++ E_code_ptr += 4;
++ L_code_ptr += 4;
++ P_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i)
++ {
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ }
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Generic implementation (no SIMD intrinsics): performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation; every product is formed as lv_16sc_t and then accumulated as lv_32fc_t
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry)
++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry)
++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ lv_16sc_t bb_signal_sample;
++ lv_16sc_t tmp1;
++ lv_16sc_t tmp2;
++ lv_16sc_t tmp3;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation, one complex sample per iteration
++
++ for(int i=0; i < num_points; ++i) // NOTE(review): signed index compared against unsigned num_points
++ {
++ // Carrier wipe-off: multiply the input sample by the carrier sample
++ bb_signal_sample = input[i] * carrier[i];
++
++ tmp1 = bb_signal_sample * E_code[i];
++ tmp2 = bb_signal_sample * P_code[i];
++ tmp3 = bb_signal_sample * L_code[i];
++
++ // Accumulate the early, prompt and late products into the lv_32fc_t outputs
++ *E_out += (lv_32fc_t)tmp1;
++ *P_out += (lv_32fc_t)tmp2;
++ *L_out += (lv_32fc_t)tmp3;
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++//
++//#ifdef LV_HAVE_SSE4_1
++//#include "smmintrin.h"
++///*!
++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++// \param input The input signal input
++// \param carrier The carrier signal input
++// \param E_code Early PRN code replica input
++// \param P_code Prompt PRN code replica input
++// \param L_code Late PRN code replica input
++// \param E_out Early correlation output
++// \param P_out Prompt correlation output
++// \param L_out Late correlation output
++// \param num_points The number of complex values in vectors
++// */
++//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++//{
++// const unsigned int sse_iters = num_points / 8;
++//
++// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++//
++// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
++// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
++//
++// float E_out_real = 0;
++// float E_out_imag = 0;
++// float P_out_real = 0;
++// float P_out_imag = 0;
++// float L_out_real = 0;
++// float L_out_imag = 0;
++//
++// const lv_16sc_t* input_ptr = input;
++// const lv_16sc_t* carrier_ptr = carrier;
++//
++// const lv_16sc_t* E_code_ptr = E_code;
++// lv_32fc_t* E_out_ptr = E_out;
++// const lv_16sc_t* L_code_ptr = L_code;
++// lv_32fc_t* L_out_ptr = L_out;
++// const lv_16sc_t* P_code_ptr = P_code;
++// lv_32fc_t* P_out_ptr = P_out;
++//
++// *E_out_ptr = 0;
++// *P_out_ptr = 0;
++// *L_out_ptr = 0;
++//
++// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++//
++// real_E_code_acc = _mm_setzero_ps();
++// imag_E_code_acc = _mm_setzero_ps();
++// real_P_code_acc = _mm_setzero_ps();
++// imag_P_code_acc = _mm_setzero_ps();
++// real_L_code_acc = _mm_setzero_ps();
++// imag_L_code_acc = _mm_setzero_ps();
++//
++// if (sse_iters>0)
++// {
++// for(int number = 0;number < sse_iters; number++){
++//
++// //Perform the carrier wipe-off
++// x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++// input_ptr += 4;
++// x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++//
++// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++// carrier_ptr += 4;
++// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++//
++// imagx = _mm_srli_si128 (x1, 2);
++// imagx = _mm_blend_epi16 (x2, imagx, 85);
++// realx = _mm_slli_si128 (x2, 2);
++// realx = _mm_blend_epi16 (realx, x1, 85);
++//
++// imagy = _mm_srli_si128 (y1, 2);
++// imagy = _mm_blend_epi16 (y2, imagy, 85);
++// realy = _mm_slli_si128 (y2, 2);
++// realy = _mm_blend_epi16 (realy, y1, 85);
++//
++// realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++//
++// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++//
++// //Get early values
++// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++// E_code_ptr += 4;
++// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++//
++// imagy = _mm_srli_si128 (y1, 2);
++// imagy = _mm_blend_epi16 (y2, imagy, 85);
++// realy = _mm_slli_si128 (y2, 2);
++// realy = _mm_blend_epi16 (realy, y1, 85);
++//
++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++//
++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++//
++// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
++// real_output = _mm_srli_si128 (real_output, 8);
++// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
++//
++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
++// imag_output = _mm_srli_si128 (imag_output, 8);
++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
++//
++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1);
++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
++//
++// //Get prompt values
++// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++// P_code_ptr += 4;
++// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++//
++// imagy = _mm_srli_si128 (y1, 2);
++// imagy = _mm_blend_epi16 (y2, imagy, 85);
++// realy = _mm_slli_si128 (y2, 2);
++// realy = _mm_blend_epi16 (realy, y1, 85);
++//
++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++//
++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++//
++// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
++// real_output = _mm_srli_si128 (real_output, 8);
++// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
++//
++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
++// imag_output = _mm_srli_si128 (imag_output, 8);
++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
++//
++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
++//
++// //Get late values
++// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++// L_code_ptr += 4;
++// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++//
++// imagy = _mm_srli_si128 (y1, 2);
++// imagy = _mm_blend_epi16 (y2, imagy, 85);
++// realy = _mm_slli_si128 (y2, 2);
++// realy = _mm_blend_epi16 (realy, y1, 85);
++//
++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++//
++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++//
++// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
++// real_output = _mm_srli_si128 (real_output, 8);
++// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
++//
++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
++// imag_output = _mm_srli_si128 (imag_output, 8);
++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
++//
++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
++//
++// input_ptr += 4;
++// carrier_ptr += 4;
++// E_code_ptr += 4;
++// L_code_ptr += 4;
++// P_code_ptr += 4;
++// }
++//
++// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++//
++// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++//
++// for (int i = 0; i<4; ++i)
++// {
++// E_out_real += real_E_dotProductVector[i];
++// E_out_imag += imag_E_dotProductVector[i];
++// P_out_real += real_P_dotProductVector[i];
++// P_out_imag += imag_P_dotProductVector[i];
++// L_out_real += real_L_dotProductVector[i];
++// L_out_imag += imag_L_dotProductVector[i];
++// }
++// *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++// *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++// *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++// }
++//
++// lv_16sc_t bb_signal_sample;
++// for(int i=0; i < num_points%8; ++i)
++// {
++// //Perform the carrier wipe-off
++// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++// // Now get early, late, and prompt values for each
++// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++// }
++//}
++//#endif /* LV_HAVE_SSE4_1 */
++//
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Generic implementation (no SIMD intrinsics): performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation; every product is formed as lv_16sc_t and then accumulated as lv_32fc_t
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry)
++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry)
++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++ lv_16sc_t bb_signal_sample;
++ lv_16sc_t tmp1;
++ lv_16sc_t tmp2;
++ lv_16sc_t tmp3;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation, one complex sample per iteration
++
++ for(int i=0; i < num_points; ++i) // NOTE(review): signed index compared against unsigned num_points
++ {
++ // Carrier wipe-off: multiply the input sample by the carrier sample
++ bb_signal_sample = input[i] * carrier[i];
++
++ tmp1 = bb_signal_sample * E_code[i];
++ tmp2 = bb_signal_sample * P_code[i];
++ tmp3 = bb_signal_sample * L_code[i];
++
++ // Accumulate the early, prompt and late products into the lv_32fc_t outputs
++ *E_out += (lv_32fc_t)tmp1;
++ *P_out += (lv_32fc_t)tmp2;
++ *L_out += (lv_32fc_t)tmp3;
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,595 @@
++/*!
++ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors and returns float32 values.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the
++ * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 32 bits vectors). It returns the input
++ * signal in base band (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++ /*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 version using _mm_lddqu_si128 loads; 8 complex samples per vector iteration, remaining num_points % 8 samples handled by a scalar loop)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // each iteration does two 128-bit loads per vector, i.e. 8 complex samples
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 real_output_ps, imag_output_ps;
++
++ float VE_out_real = 0;
++ float VE_out_imag = 0;
++ float E_out_real = 0;
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++ float VL_out_real = 0;
++ float VL_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_16sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++
++ real_VE_code_acc = _mm_setzero_ps();
++ imag_VE_code_acc = _mm_setzero_ps();
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++ real_VL_code_acc = _mm_setzero_ps();
++ imag_VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off: load 8 input and 8 carrier samples, then combine them with the CM_* macros (defined in the CommonMacros headers, not shown here)
++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++ input_ptr += 4;
++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get very early values (y1/y2 are reused for the code replica)
++ y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++ VE_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++ //Get early values
++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values
++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values
++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ //Get very late values
++ y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++ VL_code_ptr += 4;
++ y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++ input_ptr += 4;
++ carrier_ptr += 4;
++ VE_code_ptr += 4;
++ E_code_ptr += 4;
++ P_code_ptr += 4;
++ L_code_ptr += 4;
++ VL_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i)
++ {
++ VE_out_real += real_VE_dotProductVector[i];
++ VE_out_imag += imag_VE_dotProductVector[i];
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ VL_out_real += real_VL_dotProductVector[i];
++ VL_out_imag += imag_VL_dotProductVector[i];
++ }
++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now accumulate the very early, early, prompt, late and very late values for the samples left over after the vector loop
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Generic implementation (no SIMD intrinsics): performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation; every product is formed as lv_16sc_t and then accumulated as lv_32fc_t
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (one accumulated value, reset to 0 on entry)
++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry)
++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry)
++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry)
++ \param VL_out Very Late correlation output (one accumulated value, reset to 0 on entry)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
++{
++ lv_16sc_t bb_signal_sample;
++ lv_16sc_t tmp1;
++ lv_16sc_t tmp2;
++ lv_16sc_t tmp3;
++ lv_16sc_t tmp4;
++ lv_16sc_t tmp5;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *VE_out = 0;
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ *VL_out = 0;
++ // perform Very Early, Early, Prompt, Late and Very Late correlation, one complex sample per iteration
++
++ for(int i=0; i < num_points; ++i) // NOTE(review): signed index compared against unsigned num_points
++ {
++ // Carrier wipe-off: multiply the input sample by the carrier sample
++ bb_signal_sample = input[i] * carrier[i];
++
++ tmp1 = bb_signal_sample * VE_code[i];
++ tmp2 = bb_signal_sample * E_code[i];
++ tmp3 = bb_signal_sample * P_code[i];
++ tmp4 = bb_signal_sample * L_code[i];
++ tmp5 = bb_signal_sample * VL_code[i];
++
++ // Accumulate the very early, early, prompt, late and very late products into the lv_32fc_t outputs
++ *VE_out += (lv_32fc_t)tmp1;
++ *E_out += (lv_32fc_t)tmp2;
++ *P_out += (lv_32fc_t)tmp3;
++ *L_out += (lv_32fc_t)tmp4;
++ *VL_out += (lv_32fc_t)tmp5;
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input (read with _mm_load_si128)
++ \param carrier The carrier signal input (read with _mm_load_si128)
++ \param VE_code Very Early PRN code replica input (read with _mm_load_si128)
++ \param E_code Early PRN code replica input (read with _mm_load_si128)
++ \param P_code Prompt PRN code replica input (read with _mm_load_si128)
++ \param L_code Late PRN code replica input (read with _mm_load_si128)
++ \param VL_code Very Late PRN code replica input (read with _mm_load_si128)
++ \param VE_out Very Early correlation output (a single accumulated value)
++ \param E_out Early correlation output (a single accumulated value)
++ \param P_out Prompt correlation output (a single accumulated value)
++ \param L_out Late correlation output (a single accumulated value)
++ \param VL_out Very Late correlation output (a single accumulated value)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // each vector iteration consumes 8 complex samples
++
++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 real_output_ps, imag_output_ps;
++
++ float VE_out_real = 0;
++ float VE_out_imag = 0;
++ float E_out_real = 0;
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++ float VL_out_real = 0;
++ float VL_out_imag = 0;
++
++ const lv_16sc_t* input_ptr = input;
++ const lv_16sc_t* carrier_ptr = carrier;
++
++ const lv_16sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_16sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_16sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_16sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_16sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++
++ *VE_out_ptr = 0; // outputs start at zero; the scalar tail loop below adds onto them
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++
++ real_VE_code_acc = _mm_setzero_ps();
++ imag_VE_code_acc = _mm_setzero_ps();
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++ real_VL_code_acc = _mm_setzero_ps();
++ imag_VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(unsigned int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x1 = _mm_load_si128((__m128i*)input_ptr);
++ input_ptr += 4;
++ x2 = _mm_load_si128((__m128i*)input_ptr);
++
++ y1 = _mm_load_si128((__m128i*)carrier_ptr);
++ carrier_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)carrier_ptr);
++
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get very early values
++ y1 = _mm_load_si128((__m128i*)VE_code_ptr);
++ VE_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)VE_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++ //Get early values
++ y1 = _mm_load_si128((__m128i*)E_code_ptr);
++ E_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values
++ y1 = _mm_load_si128((__m128i*)P_code_ptr);
++ P_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values
++ y1 = _mm_load_si128((__m128i*)L_code_ptr);
++ L_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ //Get very late values
++ y1 = _mm_load_si128((__m128i*)VL_code_ptr);
++ VL_code_ptr += 4;
++ y2 = _mm_load_si128((__m128i*)VL_code_ptr);
++
++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++ input_ptr += 4; // second half of the 8-sample step (the first += 4 sits between the two loads)
++ carrier_ptr += 4;
++ VE_code_ptr += 4;
++ E_code_ptr += 4;
++ P_code_ptr += 4;
++ L_code_ptr += 4;
++ VL_code_ptr += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i) // horizontal sum of the four lanes of every accumulator
++ {
++ VE_out_real += real_VE_dotProductVector[i];
++ VE_out_imag += imag_VE_dotProductVector[i];
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ VL_out_real += real_VL_dotProductVector[i];
++ VL_out_imag += imag_VL_dotProductVector[i];
++ }
++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++ }
++
++ lv_16sc_t bb_signal_sample;
++ for(unsigned int i=0; i < num_points%8; ++i) // scalar tail: the 0..7 samples left after the vector loop
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Add the Very Early, Early, Prompt, Late and Very Late products onto the outputs
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (a single accumulated value)
++ \param E_out Early correlation output (a single accumulated value)
++ \param P_out Prompt correlation output (a single accumulated value)
++ \param L_out Late correlation output (a single accumulated value)
++ \param VL_out Very Late correlation output (a single accumulated value)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
++{
++ lv_16sc_t bb_signal_sample; // input[i] * carrier[i], i.e. the sample after carrier wipe-off
++ lv_16sc_t tmp1; // bb_signal_sample * VE_code[i]
++ lv_16sc_t tmp2; // bb_signal_sample * E_code[i]
++ lv_16sc_t tmp3; // bb_signal_sample * P_code[i]
++ lv_16sc_t tmp4; // bb_signal_sample * L_code[i]
++ lv_16sc_t tmp5; // bb_signal_sample * VL_code[i]
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *VE_out = 0; // every output is a running sum, so each one starts from zero
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ *VL_out = 0;
++ // Perform the Very Early, Early, Prompt, Late and Very Late correlations
++
++ for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++
++ tmp1 = bb_signal_sample * VE_code[i];
++ tmp2 = bb_signal_sample * E_code[i];
++ tmp3 = bb_signal_sample * P_code[i];
++ tmp4 = bb_signal_sample * L_code[i];
++ tmp5 = bb_signal_sample * VL_code[i];
++
++ // Accumulate the five code products, each converted to lv_32fc_t
++ *VE_out += (lv_32fc_t)tmp1;
++ *E_out += (lv_32fc_t)tmp2;
++ *P_out += (lv_32fc_t)tmp3;
++ *L_out += (lv_32fc_t)tmp4;
++ *VL_out += (lv_32fc_t)tmp5;
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,68 @@
++#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
++#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++/*!
++ \brief Sums all the values of the input buffer into a single float (SSE version using aligned loads)
++ \param result Where the sum is written; always assigned, 0 when num_points is 0
++ \param inputBuffer The buffer of data to be accumulated (read with _mm_load_ps)
++ \param num_points The number of values in inputBuffer to be accumulated
++*/
++static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
++ float returnValue = 0;
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // number of full 4-float vectors
++
++ const float* aPtr = inputBuffer;
++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
++
++ __m128 accumulator = _mm_setzero_ps(); // four independent partial sums, one per lane
++ __m128 aVal = _mm_setzero_ps();
++
++ for(;number < quarterPoints; number++){
++ aVal = _mm_load_ps(aPtr);
++ accumulator = _mm_add_ps(accumulator, aVal);
++ aPtr += 4;
++ }
++ _mm_store_ps(tempBuffer,accumulator); // Spill the four partial sums so they can be added as scalars
++ returnValue = tempBuffer[0];
++ returnValue += tempBuffer[1];
++ returnValue += tempBuffer[2];
++ returnValue += tempBuffer[3];
++
++ number = quarterPoints * 4; // index of the first value the vector loop did not consume
++ for(;number < num_points; number++){
++ returnValue += (*aPtr++); // aPtr already points at that value
++ }
++ *result = returnValue;
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Sums all the values of the input buffer into a single float (portable scalar version)
++ \param result Where the sum is written; always assigned, 0 when num_points is 0
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++*/
++static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
++ const float* aPtr = inputBuffer;
++ unsigned int number = 0;
++ float returnValue = 0;
++
++ for(;number < num_points; number++){
++ returnValue += (*aPtr++); // strictly sequential sum, first value to last
++ }
++ *result = returnValue;
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,149 @@
++#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
++#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include<smmintrin.h>
++// Writes to target[0] the index of the largest float in src0 (aligned SSE4.1 version); on ties the lowest index wins, as in the generic kernel
++static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
++ if(num_points > 0){ // with no input, target[0] is left untouched
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // lanes hold -4..-1 so that the first += 4 yields indexes 0..3
++
++ float max = src0[0];
++ float index = 0; // indexes are tracked as floats so they can live in SSE lanes
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for(;number < quarterPoints; number++){
++
++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++
++ compareResults = _mm_cmpge_ps(maxValues, currentValues); // >= keeps the earlier index when a lane meets its maximum again
++
++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
++ }
++
++ // Reduce the four per-lane maxima to a single maximum and its index
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for(number = 0; number < 4; number++){
++ if(maxValuesBuffer[number] > max || (maxValuesBuffer[number] == max && maxIndexesBuffer[number] < index)){ // equal maxima: prefer the lower index
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4; // scalar pass over the 0..3 points the vector loop did not cover
++ for(;number < num_points; number++){
++ if(src0[number] > max){
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (unsigned int)index;
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE
++#include<xmmintrin.h>
++// Writes to target[0] the index of the largest float in src0 (aligned SSE version); on ties the lowest index wins, as in the generic kernel
++static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
++ if(num_points > 0){ // with no input, target[0] is left untouched
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // lanes hold -4..-1 so that the first += 4 yields indexes 0..3
++
++ float max = src0[0];
++ float index = 0; // indexes are tracked as floats so they can live in SSE lanes
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for(;number < quarterPoints; number++){
++
++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++
++ compareResults = _mm_cmpge_ps(maxValues, currentValues); // >= keeps the earlier index when a lane meets its maximum again
++
++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); // mask select: old index where the mask is set, current index elsewhere
++
++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
++ }
++
++ // Reduce the four per-lane maxima to a single maximum and its index
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for(number = 0; number < 4; number++){
++ if(maxValuesBuffer[number] > max || (maxValuesBuffer[number] == max && maxIndexesBuffer[number] < index)){ // equal maxima: prefer the lower index
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4; // scalar pass over the 0..3 points the vector loop did not cover
++ for(;number < num_points; number++){
++ if(src0[number] > max){
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (unsigned int)index;
++ }
++}
++
++#endif /*LV_HAVE_SSE*/
++
++#ifdef LV_HAVE_GENERIC
++static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
++ if(num_points > 0){ // with no input, target[0] is left untouched
++ float max = src0[0]; // best value so far; element 0 is the initial candidate
++ unsigned int index = 0;
++
++ unsigned int i = 1;
++
++ for(; i < num_points; ++i) {
++
++ if(src0[i] > max){ // strict '>' keeps the first occurrence of the maximum
++ index = i;
++ max = src0[i];
++ }
++
++ }
++ target[0] = index; // index of the largest value in src0
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,302 @@
++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++ /*!
++ \brief Multiplies each point in the input buffer by the scalar value, clips it to [-32768, 32767], then converts the result into a 16 bit integer value
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer
++ \param scalar The value multiplied against each point in the input buffer
++ \param num_points The number of data values to be converted
++ \note Neither buffer needs to be aligned: unaligned loads and stores are used
++ */
++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++
++ const unsigned int eighthPoints = num_points / 8; // each vector iteration converts 8 values
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = -32768; // clip limits, the range of int16_t
++ float max_val = 32767;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1, inputVal2;
++ __m128i intInputVal1, intInputVal2;
++ __m128 ret1, ret2;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for(;number < eighthPoints; number++){
++ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++ // Scale, then clip to [min_val, max_val]
++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32 bit integer
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 32 bit integers -> 8 16 bit integers
++
++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8; // scalar pass over the 0..7 values the vector loop did not cover
++ for(; number < num_points; number++){
++ r = inputVector[number] * scalar;
++ if(r > max_val)
++ r = max_val;
++ else if(r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++ /*!
++ \brief Multiplies each point in the input buffer by the scalar value, clips it to [-32768, 32767], then converts the result into a 16 bit integer value
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer
++ \param scalar The value multiplied against each point in the input buffer
++ \param num_points The number of data values to be converted
++ \note Input buffer does NOT need to be properly aligned (it is read with _mm_loadu_ps)
++ */
++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4; // each vector iteration converts 4 values
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = -32768; // clip limits, the range of int16_t
++ float max_val = 32767;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch: scaling and clipping are vectorised, the conversion below is scalar
++
++ for(;number < quarterPoints; number++){
++ ret = _mm_loadu_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ // Scale, then clip to [min_val, max_val]
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++
++ _mm_store_ps(outputFloatBuffer, ret);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
++ }
++
++ number = quarterPoints * 4; // scalar pass over the 0..3 values the vector loop did not cover
++ for(; number < num_points; number++){
++ r = inputVector[number] * scalar;
++ if(r > max_val)
++ r = max_val;
++ else if(r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Multiplies each point in the input buffer by the scalar value, clips it to [-32768, 32767], then converts the result into a 16 bit integer value
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer
++ \param scalar The value multiplied against each point in the input buffer
++ \param num_points The number of data values to be converted
++ \note Input buffer does NOT need to be properly aligned (portable scalar version)
++ */
++static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
++ int16_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float min_val = -32768; // clip limits, the range of int16_t
++ float max_val = 32767;
++ float r;
++
++ for(number = 0; number < num_points; number++){
++ r = *inputVectorPtr++ * scalar;
++ if(r > max_val) // clip before converting so the cast cannot overflow
++ r = max_val;
++ else if(r < min_val)
++ r = min_val;
++ *outputVectorPtr++ = (int16_t)rintf(r);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
++
++#include <volk/volk_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++ /*!
++ \brief Multiplies each point in the input buffer by the scalar value, clips it to [-32768, 32767], then converts the result into a 16 bit integer value
++ \param outputVector The 16 bit output data buffer (written with _mm_store_si128)
++ \param inputVector The floating point input data buffer (read with _mm_load_ps)
++ \param scalar The value multiplied against each point in the input buffer
++ \param num_points The number of data values to be converted
++ */
++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++
++ const unsigned int eighthPoints = num_points / 8; // each vector iteration converts 8 values
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = -32768; // clip limits, the range of int16_t
++ float max_val = 32767;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 inputVal1, inputVal2;
++ __m128i intInputVal1, intInputVal2;
++ __m128 ret1, ret2;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for(;number < eighthPoints; number++){
++ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++ // Scale, then clip to [min_val, max_val]
++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32 bit integer
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 32 bit integers -> 8 16 bit integers
++
++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++ outputVectorPtr += 8;
++ }
++
++ number = eighthPoints * 8; // scalar pass over the 0..7 values the vector loop did not cover
++ for(; number < num_points; number++){
++ r = inputVector[number] * scalar;
++ if(r > max_val)
++ r = max_val;
++ else if(r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++ /*!
++ \brief Multiplies each point in the input buffer by the scalar value, clips it to [-32768, 32767], then converts the result into a 16 bit integer value
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer (read with _mm_load_ps)
++ \param scalar The value multiplied against each point in the input buffer
++ \param num_points The number of data values to be converted
++ */
++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
++ unsigned int number = 0;
++
++ const unsigned int quarterPoints = num_points / 4; // each vector iteration converts 4 values
++
++ const float* inputVectorPtr = (const float*)inputVector;
++ int16_t* outputVectorPtr = outputVector;
++
++ float min_val = -32768; // clip limits, the range of int16_t
++ float max_val = 32767;
++ float r;
++
++ __m128 vScalar = _mm_set_ps1(scalar);
++ __m128 ret;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch: scaling and clipping are vectorised, the conversion below is scalar
++
++ for(;number < quarterPoints; number++){
++ ret = _mm_load_ps(inputVectorPtr);
++ inputVectorPtr += 4;
++
++ // Scale, then clip to [min_val, max_val]
++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
++
++ _mm_store_ps(outputFloatBuffer, ret);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
++ }
++
++ number = quarterPoints * 4; // scalar pass over the 0..3 values the vector loop did not cover
++ for(; number < num_points; number++){
++ r = inputVector[number] * scalar;
++ if(r > max_val)
++ r = max_val;
++ else if(r < min_val)
++ r = min_val;
++ outputVector[number] = (int16_t)rintf(r);
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Multiplies each point in the input buffer by the scalar value, clips it to [-32768, 32767], then converts the result into a 16 bit integer value
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer
++ \param scalar The value multiplied against each point in the input buffer
++ \param num_points The number of data values to be converted
++ */
++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
++ int16_t* outputVectorPtr = outputVector;
++ const float* inputVectorPtr = inputVector;
++ unsigned int number = 0;
++ float min_val = -32768; // clip limits, the range of int16_t
++ float max_val = 32767;
++ float r;
++
++ for(number = 0; number < num_points; number++){
++ r = *inputVectorPtr++ * scalar;
++ if(r < min_val) // clip before converting so the cast cannot overflow
++ r = min_val;
++ else if(r > max_val)
++ r = max_val;
++ *outputVectorPtr++ = (int16_t)rintf(r);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,147 @@
++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++/*!
++ \brief Adds the two input vectors element by element and stores the results in the third vector (SSE version using unaligned loads and stores)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // number of full 4-float vectors
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr= bVector;
++
++ __m128 aVal, bVal, cVal;
++ for(;number < quarterPoints; number++){
++
++ aVal = _mm_loadu_ps(aPtr);
++ bVal = _mm_loadu_ps(bPtr);
++
++ cVal = _mm_add_ps(aVal, bVal);
++
++ _mm_storeu_ps(cPtr,cVal); // Store the four sums into cVector
++
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ number = quarterPoints * 4; // scalar pass over the 0..3 values the vector loop did not cover
++ for(;number < num_points; number++){
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds the two input vectors element by element and stores the results in the third vector (portable scalar version)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr= bVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) + (*bPtr++); // c[number] = a[number] + b[number]
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++/*!
++ \brief Adds the two input vectors element by element and stores the results in a third vector (SSE version using aligned loads and stores)
++ \param cVector The vector where the results will be stored (accessed with _mm_store_ps, which requires a 16-byte aligned address)
++ \param aVector One of the vectors to be added (accessed with _mm_load_ps, which requires a 16-byte aligned address)
++ \param bVector One of the vectors to be added (accessed with _mm_load_ps, which requires a 16-byte aligned address)
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // number of full 4-float SSE iterations
++
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr= bVector;
++
++ __m128 aVal, bVal, cVal;
++ for(;number < quarterPoints; number++){
++
++ aVal = _mm_load_ps(aPtr); // aligned load of 4 floats from each input
++ bVal = _mm_load_ps(bPtr);
++
++ cVal = _mm_add_ps(aVal, bVal);
++
++ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
++
++ aPtr += 4;
++ bPtr += 4;
++ cPtr += 4;
++ }
++
++ number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
++ for(;number < num_points; number++){
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds the two input vectors element by element and stores the results in a third vector (portable scalar version, same loop as the _generic kernel)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++ float* cPtr = cVector;
++ const float* aPtr = aVector;
++ const float* bPtr= bVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){ // one scalar addition per element
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief Adds the two input vectors and stores their results in the third vector by forwarding to the externally defined ORC implementation
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // defined outside this header
++static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ // NOTE(review): named _u_orc but placed in the _a_H section and calls the _a_ impl - confirm this is intended
++ volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,127 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Takes the conjugate of a complex vector (version using unaligned loads and stores, two complex values per iteration).
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // one __m128 register holds two complex values
++
++ __m128 x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++
++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // -0.f has only the sign bit set: mask for the two imaginary lanes
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
++
++ x = _mm_xor_ps(x, conjugator); // conjugate register
++
++ _mm_storeu_ps((float*)c,x); // Store the results back into the C container
++
++ a += 2;
++ c += 2;
++ }
++
++ if((num_points % 2) != 0) { // odd count: conjugate the one remaining value
++ *c = lv_conj(*a);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Takes the conjugate of a complex vector (portable version, one lv_conj call per element).
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = lv_conj(*aPtr++); // lv_conj comes from volk_gnsssdr_complex.h, included above
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Takes the conjugate of a complex vector (version using aligned loads and stores, two complex values per iteration).
++ \param cVector The vector where the results will be stored (accessed with _mm_store_ps, which requires a 16-byte aligned address)
++ \param aVector Vector to be conjugated (accessed with _mm_load_ps, which requires a 16-byte aligned address)
++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // one __m128 register holds two complex values
++
++ __m128 x;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++
++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // -0.f has only the sign bit set: mask for the two imaginary lanes
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
++
++ x = _mm_xor_ps(x, conjugator); // conjugate register
++
++ _mm_store_ps((float*)c,x); // Store the results back into the C container
++
++ a += 2;
++ c += 2;
++ }
++
++ if((num_points % 2) != 0) { // odd count: conjugate the one remaining value
++ *c = lv_conj(*a);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Takes the conjugate of a complex vector (portable version, same loop as the _generic kernel).
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = lv_conj(*aPtr++); // lv_conj comes from volk_gnsssdr_complex.h, included above
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,295 @@
++/*!
++ * \file volk_gnsssdr_32fc_convert_16ic.h
++ * \brief Volk protokernel: converts float32 complex values to 16-bit integer complex values, taking care of overflow
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping each component to [-32768, 32767] (unaligned SSE2 version)
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    float min_val = -32768;
++    float max_val = 32767;
++
++    __m128 inputVal1, inputVal2;
++    __m128i intInputVal1, intInputVal2;
++    __m128 ret1, ret2;
++    __m128 vmin_val = _mm_set_ps1(min_val);
++    __m128 vmax_val = _mm_set_ps1(max_val);
++
++    for(unsigned int i = 0;i < sse_iters; i++){
++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 8 x int32 -> 8 x int16
++
++        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++        outputVectorPtr += 8;
++    }
++
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: 2 components per remaining complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping each component to [-32768, 32767] (unaligned version guarded by LV_HAVE_SSE)
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    float min_val = -32768;
++    float max_val = 32767;
++
++    __m128 inputVal1, inputVal2;
++    __m128i intInputVal1, intInputVal2; // NOTE(review): __m128i and the integer intrinsics below are SSE2 (emmintrin.h) although this kernel is guarded by LV_HAVE_SSE - confirm
++    __m128 ret1, ret2;
++    __m128 vmin_val = _mm_set_ps1(min_val);
++    __m128 vmax_val = _mm_set_ps1(max_val);
++
++    for(unsigned int i = 0;i < sse_iters; i++){
++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 8 x int32 -> 8 x int16
++
++        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++        outputVectorPtr += 8;
++    }
++
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: 2 components per remaining complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping each component to [-32768, 32767] (portable scalar version)
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    float min_val = -32768;
++    float max_val = 32767;
++    for(unsigned int i = 0; i < num_points*2; i++){ // 2 components (real, imaginary) per complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
++
++#include <volk/volk_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping each component to [-32768, 32767] (aligned SSE2 version)
++ \param outputVector The 16 bit output data buffer (written with _mm_store_si128, which requires a 16-byte aligned address)
++ \param inputVector The floating point input data buffer (read with _mm_load_ps, which requires a 16-byte aligned address; it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    float min_val = -32768;
++    float max_val = 32767;
++
++    __m128 inputVal1, inputVal2;
++    __m128i intInputVal1, intInputVal2;
++    __m128 ret1, ret2;
++    __m128 vmin_val = _mm_set_ps1(min_val);
++    __m128 vmax_val = _mm_set_ps1(max_val);
++
++    for(unsigned int i = 0;i < sse_iters; i++){
++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 8 x int32 -> 8 x int16
++
++        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++        outputVectorPtr += 8;
++    }
++
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: 2 components per remaining complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping each component to [-32768, 32767] (aligned version guarded by LV_HAVE_SSE)
++ \param outputVector The 16 bit output data buffer (written with _mm_store_si128, which requires a 16-byte aligned address)
++ \param inputVector The floating point input data buffer (read with _mm_load_ps, which requires a 16-byte aligned address; it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    float min_val = -32768;
++    float max_val = 32767;
++
++    __m128 inputVal1, inputVal2;
++    __m128i intInputVal1, intInputVal2; // NOTE(review): __m128i and the integer intrinsics below are SSE2 (emmintrin.h) although this kernel is guarded by LV_HAVE_SSE - confirm
++    __m128 ret1, ret2;
++    __m128 vmin_val = _mm_set_ps1(min_val);
++    __m128 vmax_val = _mm_set_ps1(max_val);
++
++    for(unsigned int i = 0;i < sse_iters; i++){
++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 8 x int32 -> 8 x int16
++
++        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++        outputVectorPtr += 8;
++    }
++
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: 2 components per remaining complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping each component to [-32768, 32767] (portable scalar version)
++ \param outputVector The 16 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    float min_val = -32768;
++    float max_val = 32767;
++    for(unsigned int i = 0; i < num_points*2; i++){ // 2 components (real, imaginary) per complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,213 @@
++/*!
++ * \file volk_gnsssdr_32fc_convert_8ic.h
++ * \brief Volk protokernel: converts float32 complex values to 8-bit integer complex values, taking care of overflow
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping each component to [-128, 127] (unaligned SSE2 version)
++ \param outputVector The 8 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 floats)
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++    float min_val = -128;
++    float max_val = 127;
++
++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++    __m128i int8InputVal;
++    __m128 ret1, ret2, ret3, ret4;
++    __m128 vmin_val = _mm_set_ps1(min_val);
++    __m128 vmax_val = _mm_set_ps1(max_val);
++
++    for(unsigned int i = 0;i < sse_iters; i++){
++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++        intInputVal3 = _mm_cvtps_epi32(ret3);
++        intInputVal4 = _mm_cvtps_epi32(ret4);
++
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 16 x int32 -> 16 x int16 (two registers)
++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 x int16 -> 16 x int8
++
++        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
++        outputVectorPtr += 16;
++    }
++
++    for(unsigned int i = 0; i < (num_points%8)*2; i++){ // scalar tail: the SIMD loop consumed sse_iters*8 complex values, so (num_points%8) values = (num_points%8)*2 components remain
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping each component to [-128, 127] (portable scalar version)
++ \param outputVector The 8 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++    float min_val = -128;
++    float max_val = 127;
++    for(unsigned int i = 0; i < num_points*2; i++){ // 2 components (real, imaginary) per complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
++
++#include <volk/volk_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping each component to [-128, 127] (aligned SSE2 version)
++ \param outputVector The 8 bit output data buffer (written with _mm_store_si128, which requires a 16-byte aligned address)
++ \param inputVector The floating point input data buffer (read with _mm_load_ps, which requires a 16-byte aligned address; it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 floats)
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++    float min_val = -128;
++    float max_val = 127;
++
++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++    __m128i int8InputVal;
++    __m128 ret1, ret2, ret3, ret4;
++    __m128 vmin_val = _mm_set_ps1(min_val);
++    __m128 vmax_val = _mm_set_ps1(max_val);
++
++    for(unsigned int i = 0;i < sse_iters; i++){
++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++        intInputVal3 = _mm_cvtps_epi32(ret3);
++        intInputVal4 = _mm_cvtps_epi32(ret4);
++
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 16 x int32 -> 16 x int16 (two registers)
++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 x int16 -> 16 x int8
++
++        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
++        outputVectorPtr += 16;
++    }
++
++    for(unsigned int i = 0; i < (num_points%8)*2; i++){ // scalar tail: the SIMD loop consumed sse_iters*8 complex values, so (num_points%8) values = (num_points%8)*2 components remain
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping each component to [-128, 127] (portable scalar version)
++ \param outputVector The 8 bit output data buffer
++ \param inputVector The floating point input data buffer (read-only: it is never written to)
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++    float min_val = -128;
++    float max_val = 127;
++    for(unsigned int i = 0; i < num_points*2; i++){ // 2 components (real, imaginary) per complex value
++        float aux = inputVectorPtr[i]; // clip a local copy so the caller's const input is left untouched
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,228 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE3 version using unaligned loads and stores)
++ \param complexVector The vector containing the complex input values
++ \param magnitudeVector The vector containing the real output values
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // each iteration handles 4 complex values (8 floats)
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for(;number < quarterPoints; number++){
++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
++
++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
++
++ _mm_storeu_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 complex values
++ for(; number < num_points; number++){
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++ /*!
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE version using unaligned loads and stores)
++ \param complexVector The vector containing the complex input values
++ \param magnitudeVector The vector containing the real output values
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // each iteration handles 4 complex values (8 floats)
++
++ const float* complexVectorPtr = (float*)complexVector;
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
++ for(;number < quarterPoints; number++){
++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++ complexVectorPtr += 4;
++
++ // Arrange in i1i2i3i4 format
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++ // Arrange in q1q2q3q4 format
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++
++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
++
++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
++
++ _mm_storeu_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 complex values
++ for(; number < num_points; number++){
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (portable scalar version)
++ \param complexVector The vector containing the complex input values
++ \param magnitudeVector The vector containing the real output values
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++ const float* complexVectorPtr = (float*)complexVector; // the complex input is read as consecutive (real, imag) float pairs
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for(number = 0; number < num_points; number++){
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (real*real) + (imag*imag);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Computes the squared magnitude (re*re + im*im) of every complex sample, four samples per SSE3 iteration, using aligned loads and stores
++ \param complexVector The vector containing the complex input values (read as interleaved real/imaginary floats; loaded with _mm_load_ps, so it must be 16-byte aligned)
++ \param magnitudeVector The vector receiving the real output values (written with _mm_store_ps, so it must be 16-byte aligned)
++ \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // number of full 4-sample SIMD iterations
++
++ const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, result;
++ for(;number < quarterPoints; number++){
++ cplxValue1 = _mm_load_ps(complexVectorPtr); // samples 1-2: i1 q1 i2 q2
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr); // samples 3-4: i3 q3 i4 q4
++ complexVectorPtr += 4;
++
++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the values
++
++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Horizontal add of adjacent pairs gives I^2 + Q^2 per sample
++
++ _mm_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4; // first sample not covered by the SIMD loop
++ for(; number < num_points; number++){
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_SSE
++#include <xmmintrin.h>
++ /*!
++ \brief Computes the squared magnitude (re*re + im*im) of every complex sample, four samples per SSE iteration, using aligned loads and stores
++ \param complexVector The vector containing the complex input values (read as interleaved real/imaginary floats; loaded with _mm_load_ps, so it must be 16-byte aligned)
++ \param magnitudeVector The vector receiving the real output values (written with _mm_store_ps, so it must be 16-byte aligned)
++ \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // number of full 4-sample SIMD iterations
++
++ const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats
++ float* magnitudeVectorPtr = magnitudeVector;
++
++ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
++ for(;number < quarterPoints; number++){
++ cplxValue1 = _mm_load_ps(complexVectorPtr); // samples 1-2: i1 q1 i2 q2
++ complexVectorPtr += 4;
++
++ cplxValue2 = _mm_load_ps(complexVectorPtr); // samples 3-4: i3 q3 i4 q4
++ complexVectorPtr += 4;
++
++ // Gather the four real parts: i1 i2 i3 i4
++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++ // Gather the four imaginary parts: q1 q2 q3 q4
++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++
++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
++
++ result = _mm_add_ps(iValue, qValue); // I^2 + Q^2 for the four samples
++
++ _mm_store_ps(magnitudeVectorPtr, result);
++ magnitudeVectorPtr += 4;
++ }
++
++ number = quarterPoints * 4; // first sample not covered by the SIMD loop
++ for(; number < num_points; number++){
++ float val1Real = *complexVectorPtr++;
++ float val1Imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++ }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Computes the squared magnitude (re*re + im*im) of every complex sample with a plain scalar loop
++ \param complexVector The vector containing the complex input values (read as interleaved real/imaginary floats)
++ \param magnitudeVector The vector receiving the real output values, one float per complex input
++ \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++ const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats
++ float* magnitudeVectorPtr = magnitudeVector;
++ unsigned int number = 0;
++ for(number = 0; number < num_points; number++){
++ const float real = *complexVectorPtr++;
++ const float imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (real*real) + (imag*imag);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,231 @@
++/*!
++ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h
++ * \brief Volk protokernel: converts float32 complex values to 8-bit integer complex values, saturating on overflow
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Divides every I/Q float component by scalar, saturates it to [-128, 127], rounds it and stores it as an 8-bit integer (SSE2, unaligned loads and stores, eight complex values per iteration)
++ \param inputVector The complex floating point input data buffer
++ \param outputVector The complex 8-bit integer output data buffer
++ \param scalar The divisor applied to every component before saturation
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
++ const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 components)
++
++ float* inputVectorPtr = (float*)inputVector; // treat the complex input as interleaved floats
++ int8_t* outputVectorPtr = (int8_t*)outputVector; // treat the complex output as interleaved int8 components
++ __m128 invScalar = _mm_set_ps1(1.0/scalar);
++
++ float min_val = -128;
++ float max_val = 127;
++ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++ __m128i int8InputVal;
++ __m128 ret1, ret2, ret3, ret4;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for(unsigned int i = 0;i < sse_iters; i++){
++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++ inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++ inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++
++ inputVal1 = _mm_mul_ps(inputVal1, invScalar);
++ inputVal2 = _mm_mul_ps(inputVal2, invScalar);
++ inputVal3 = _mm_mul_ps(inputVal3, invScalar);
++ inputVal4 = _mm_mul_ps(inputVal4, invScalar);
++ // Clip to the int8 range before converting
++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1);
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++ intInputVal3 = _mm_cvtps_epi32(ret3);
++ intInputVal4 = _mm_cvtps_epi32(ret4);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 -> 16 bit, signed saturation
++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 -> 8 bit, signed saturation
++
++ _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
++ outputVectorPtr += 16;
++ }
++
++ float scaled = 0;
++ for(unsigned int i = 0; i < (num_points%8)*2; i++){ // leftover: num_points%8 complex values, two components each
++ scaled = inputVectorPtr[i]/scalar;
++ if(scaled > max_val)
++ scaled = max_val;
++ else if(scaled < min_val)
++ scaled = min_val;
++ outputVectorPtr[i] = (int8_t)rintf(scaled);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Divides every I/Q float component by scalar, saturates it to [-128, 127], rounds it with rintf() and stores it as an 8-bit integer
++ \param inputVector The complex floating point input data buffer
++ \param outputVector The complex 8-bit integer output data buffer
++ \param num_points The number of complex values to be converted (2*num_points components are read and written)
++ */
++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
++ float* inputVectorPtr = (float*)inputVector; // treat the complex input as interleaved floats
++ int8_t* outputVectorPtr = (int8_t*)outputVector; // treat the complex output as interleaved int8 components
++ float scaled = 0;
++ float min_val = -128;
++ float max_val = 127;
++
++ for(unsigned int i = 0; i < num_points*2; i++){
++ scaled = (inputVectorPtr[i])/scalar;
++ if(scaled > max_val)
++ scaled = max_val;
++ else if(scaled < min_val)
++ scaled = min_val;
++ outputVectorPtr[i] = (int8_t)rintf(scaled);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
++
++#include <volk/volk_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Divides every I/Q float component by scalar, saturates it to [-128, 127], rounds it and stores it as an 8-bit integer (SSE2, aligned loads and stores, eight complex values per iteration)
++ \param inputVector The complex floating point input data buffer (loaded with _mm_load_ps, so it must be 16-byte aligned)
++ \param outputVector The complex 8-bit integer output data buffer (written with _mm_store_si128, so it must be 16-byte aligned)
++ \param scalar The divisor applied to every component before saturation
++ \param num_points The number of complex values to be converted
++ */
++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
++ const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 components)
++
++ float* inputVectorPtr = (float*)inputVector; // treat the complex input as interleaved floats
++ int8_t* outputVectorPtr = (int8_t*)outputVector; // treat the complex output as interleaved int8 components
++ __m128 invScalar = _mm_set_ps1(1.0/scalar);
++
++ float min_val = -128;
++ float max_val = 127;
++ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++ __m128i int8InputVal;
++ __m128 ret1, ret2, ret3, ret4;
++ __m128 vmin_val = _mm_set_ps1(min_val);
++ __m128 vmax_val = _mm_set_ps1(max_val);
++
++ for(unsigned int i = 0;i < sse_iters; i++){
++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++ inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++ inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
++
++ inputVal1 = _mm_mul_ps(inputVal1, invScalar);
++ inputVal2 = _mm_mul_ps(inputVal2, invScalar);
++ inputVal3 = _mm_mul_ps(inputVal3, invScalar);
++ inputVal4 = _mm_mul_ps(inputVal4, invScalar);
++ // Clip to the int8 range before converting
++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
++
++ intInputVal1 = _mm_cvtps_epi32(ret1);
++ intInputVal2 = _mm_cvtps_epi32(ret2);
++ intInputVal3 = _mm_cvtps_epi32(ret3);
++ intInputVal4 = _mm_cvtps_epi32(ret4);
++
++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 -> 16 bit, signed saturation
++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 -> 8 bit, signed saturation
++
++ _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
++ outputVectorPtr += 16;
++ }
++
++ float scaled = 0;
++ for(unsigned int i = 0; i < (num_points%8)*2; i++){ // leftover: num_points%8 complex values, two components each
++ scaled = inputVectorPtr[i]/scalar;
++ if(scaled > max_val)
++ scaled = max_val;
++ else if(scaled < min_val)
++ scaled = min_val;
++ outputVectorPtr[i] = (int8_t)rintf(scaled);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Divides every I/Q float component by scalar, saturates it to [-128, 127], rounds it with rintf() and stores it as an 8-bit integer
++ \param inputVector The complex floating point input data buffer
++ \param outputVector The complex 8-bit integer output data buffer
++ \param num_points The number of complex values to be converted (2*num_points components are read and written)
++ */
++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
++ float* inputVectorPtr = (float*)inputVector; // treat the complex input as interleaved floats
++ int8_t* outputVectorPtr = (int8_t*)outputVector; // treat the complex output as interleaved int8 components
++ float scaled = 0;
++ float min_val = -128;
++ float max_val = 127;
++
++ for(unsigned int i = 0; i < num_points*2; i++){
++ scaled = inputVectorPtr[i]/scalar;
++ if(scaled > max_val)
++ scaled = max_val;
++ else if(scaled < min_val)
++ scaled = min_val;
++ outputVectorPtr[i] = (int8_t)rintf(scaled);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,266 @@
++/*!
++ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
++ * \brief Volk protokernel: replaces the tracking function for update_local_code
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that replaces the tracking function for update_local_code
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include <smmintrin.h>
++ /*!
++ \brief Builds a resampled local code: for every output sample the code phase is shifted by 2*d_very_early_late_spc_chips, wrapped to code_length_half_chips, rounded, offset by 2 and used as an index into d_ca_code (SSE4.1, unaligned stores)
++ \param d_very_early_code The vector where the selected code samples will be stored
++ \param d_very_early_late_spc_chips Early/late spacing; it is doubled and subtracted from the code phase before wrapping
++ \param code_length_half_chips Modulus used to wrap the shifted code phase
++ \param code_phase_step_half_chips Code phase increment between consecutive output samples
++ \param tcode_half_chips_input Code phase of the first output sample
++ \param d_ca_code The code table that is indexed to produce the output
++ \param num_points The number of complex values to be stored into d_very_early_code
++ */
++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
++
++ // Four output samples are produced per iteration. fmod() is emulated in SSE as
++ // numer - trunc(numer/denom)*denom, the result is rounded to the nearest integer and
++ // offset by 2 to obtain the d_ca_code index of each lane. The num_points%4 leftover
++ // samples are produced by the scalar loop at the end, starting from lane 0's code phase.
++
++ const unsigned int sse_iters = num_points / 4;
++
++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
++
++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
++ __m128 twos = _mm_set1_ps (2);
++ __m128i associated_chip_index_array_int;
++
++ __VOLK_ATTR_ALIGNED(16) int32_t output[4];
++
++ for (unsigned int i = 0; i < sse_iters; i++)
++ {
++ //fmod = numer - tquot * denom; tquot = numer/denom truncated
++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
++
++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
++ _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);
++
++ //d_very_early_code[i] = d_ca_code[associated_chip_index];
++ *d_very_early_code++ = d_ca_code[output[0]];
++ *d_very_early_code++ = d_ca_code[output[1]];
++ *d_very_early_code++ = d_ca_code[output[2]];
++ *d_very_early_code++ = d_ca_code[output[3]];
++
++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
++ }
++
++ if (num_points%4!=0)
++ {
++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
++ _mm_storeu_ps (tcode_half_chips_stored, tcode_half_chips_array); // float store: the register holds floats, not integers
++
++ int associated_chip_index;
++ float tcode_half_chips = tcode_half_chips_stored[0]; // lane 0 is the code phase of the first unprocessed sample
++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
++
++ for (unsigned int i = 0; i < num_points%4; i++)
++ {
++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
++ d_very_early_code[i] = d_ca_code[associated_chip_index];
++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
++ }
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Builds a resampled local code: for every output sample the code phase is shifted by 2*d_very_early_late_spc_chips, wrapped to code_length_half_chips, rounded, offset by 2 and used as an index into d_ca_code
++ \param d_very_early_code The vector where the selected code samples will be stored
++ \param d_very_early_late_spc_chips Early/late spacing; it is doubled and subtracted from the code phase before wrapping
++ \param code_length_half_chips Modulus used to wrap the shifted code phase
++ \param code_phase_step_half_chips Code phase increment between consecutive output samples
++ \param tcode_half_chips_input Code phase of the first output sample
++ \param d_ca_code The code table that is indexed to produce the output
++ \param num_points The number of complex values to be stored into d_very_early_code
++ */
++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
++
++ // The const inputs are used exactly as passed in. For every output sample the code phase
++ // is shifted by twice the early/late spacing, wrapped with fmod() to one code length,
++ // rounded to the nearest integer and offset by 2 to obtain the d_ca_code entry to copy.
++ // NOTE(review): fmod() keeps the sign of its first argument, so a negative shifted phase can yield an index below 2 - confirm callers keep it non-negative.
++
++ int associated_chip_index;
++ float tcode_half_chips = tcode_half_chips_input;
++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
++
++ for (unsigned int i = 0; i < num_points; i++)
++ {
++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
++ d_very_early_code[i] = d_ca_code[associated_chip_index];
++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include <smmintrin.h>
++ /*!
++ \brief Builds a resampled local code: for every output sample the code phase is shifted by 2*d_very_early_late_spc_chips, wrapped to code_length_half_chips, rounded, offset by 2 and used as an index into d_ca_code (SSE4.1, aligned stores to local scratch buffers)
++ \param d_very_early_code The vector where the selected code samples will be stored
++ \param d_very_early_late_spc_chips Early/late spacing; it is doubled and subtracted from the code phase before wrapping
++ \param code_length_half_chips Modulus used to wrap the shifted code phase
++ \param code_phase_step_half_chips Code phase increment between consecutive output samples
++ \param tcode_half_chips_input Code phase of the first output sample
++ \param d_ca_code The code table that is indexed to produce the output
++ \param num_points The number of complex values to be stored into d_very_early_code
++ */
++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
++
++ // Four output samples are produced per iteration. fmod() is emulated in SSE as
++ // numer - trunc(numer/denom)*denom, the result is rounded to the nearest integer and
++ // offset by 2 to obtain the d_ca_code index of each lane. The num_points%4 leftover
++ // samples are produced by the scalar loop at the end, starting from lane 0's code phase.
++
++ const unsigned int sse_iters = num_points / 4;
++
++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
++
++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
++ __m128 twos = _mm_set1_ps (2);
++ __m128i associated_chip_index_array_int;
++
++ __VOLK_ATTR_ALIGNED(16) int32_t output[4];
++
++ for (unsigned int i = 0; i < sse_iters; i++)
++ {
++ //fmod = numer - tquot * denom; tquot = numer/denom truncated
++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
++
++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
++ _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);
++
++ //d_very_early_code[i] = d_ca_code[associated_chip_index];
++ *d_very_early_code++ = d_ca_code[output[0]];
++ *d_very_early_code++ = d_ca_code[output[1]];
++ *d_very_early_code++ = d_ca_code[output[2]];
++ *d_very_early_code++ = d_ca_code[output[3]];
++
++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
++ }
++
++ if (num_points%4!=0)
++ {
++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
++ _mm_store_ps (tcode_half_chips_stored, tcode_half_chips_array); // float store: the register holds floats, not integers
++
++ int associated_chip_index;
++ float tcode_half_chips = tcode_half_chips_stored[0]; // lane 0 is the code phase of the first unprocessed sample
++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
++
++ for (unsigned int i = 0; i < num_points%4; i++)
++ {
++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
++ d_very_early_code[i] = d_ca_code[associated_chip_index];
++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
++ }
++ }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Builds a resampled local code: for every output sample the code phase is shifted by 2*d_very_early_late_spc_chips, wrapped to code_length_half_chips, rounded, offset by 2 and used as an index into d_ca_code
++ \param d_very_early_code The vector where the selected code samples will be stored
++ \param d_very_early_late_spc_chips Early/late spacing; it is doubled and subtracted from the code phase before wrapping
++ \param code_length_half_chips Modulus used to wrap the shifted code phase
++ \param code_phase_step_half_chips Code phase increment between consecutive output samples
++ \param tcode_half_chips_input Code phase of the first output sample
++ \param d_ca_code The code table that is indexed to produce the output
++ \param num_points The number of complex values to be stored into d_very_early_code
++ */
++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
++
++ // For every output sample the code phase is shifted by twice the early/late spacing,
++ // wrapped with fmod() to one code length, rounded to the nearest integer and offset
++ // by 2 to obtain the index of the d_ca_code entry to copy.
++ // NOTE(review): fmod() keeps the sign of its first argument, so a negative shifted phase can yield an index below 2 - confirm callers keep it non-negative.
++
++ int associated_chip_index;
++ float tcode_half_chips = tcode_half_chips_input;
++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
++
++ for (unsigned int i = 0; i < num_points; i++)
++ {
++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
++ d_very_early_code[i] = d_ca_code[associated_chip_index];
++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,178 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Multiplies every element of the input vector by a complex scalar and stores the results in the output vector (SSE3, unaligned loads and stores, two complex values per iteration)
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++*/
++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // number of full 2-sample SIMD iterations
++
++ __m128 x, yl, yh, z, tmp1, tmp2;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++
++ // Set up constant scalar vectors: yl = real part (sr) in every lane, yh = imaginary part (si) in every lane
++ yl = _mm_set_ps1(lv_creal(scalar));
++ yh = _mm_set_ps1(lv_cimag(scalar));
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_loadu_ps((float*)a); // Load two complex values a and b as ar,ai,br,bi
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
++
++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
++
++ a += 2;
++ c += 2;
++ }
++
++ if((num_points % 2) != 0) { // odd count: one complex value is left over
++ *c = (*a) * scalar;
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Multiplies every element of the input vector by a complex scalar and stores the results in the output vector
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++*/
++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ unsigned int number = num_points; // counts the values still to be processed
++
++ // manually unrolled loop: eight products per iteration
++ while (number >= 8){
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ number -= 8;
++ }
++
++ // clean up the remaining (fewer than eight) values
++ while (number-- > 0)
++ *cPtr++ = *aPtr++ * scalar;
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Multiplies every element of the input vector by a complex scalar and stores the results in the output vector (SSE3, aligned loads and stores, two complex values per iteration)
++ \param cVector The vector where the results will be stored (written with _mm_store_ps, so it must be 16-byte aligned)
++ \param aVector The vector to be multiplied (loaded with _mm_load_ps, so it must be 16-byte aligned)
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // number of full 2-sample SIMD iterations
++
++ __m128 x, yl, yh, z, tmp1, tmp2;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++
++ // Set up constant scalar vectors: yl = real part (sr) in every lane, yh = imaginary part (si) in every lane
++ yl = _mm_set_ps1(lv_creal(scalar));
++ yh = _mm_set_ps1(lv_cimag(scalar));
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_load_ps((float*)a); // Load two complex values a and b as ar,ai,br,bi
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
++
++ _mm_store_ps((float*)c,z); // Store the results back into the C container
++
++ a += 2;
++ c += 2;
++ }
++
++ if((num_points % 2) != 0) { // odd count: one complex value is left over
++ *c = (*a) * scalar;
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Multiplies the input complex vector by a complex scalar and stores the results in the output vector
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++ lv_32fc_t* cPtr = cVector; // write cursor into the output vector
++ const lv_32fc_t* aPtr = aVector; // read cursor into the input vector
++ unsigned int number = num_points; // points still to be processed
++
++ // unrolled loop: handle 8 points per iteration
++ while (number >= 8){
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ number -= 8;
++ }
++
++ // clean up the remaining (num_points % 8) points one at a time
++ while (number-- > 0)
++ *cPtr++ = *aPtr++ * scalar;
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,763 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <stdio.h>
++#include <string.h>
++
++
++#ifdef LV_HAVE_GENERIC
++
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // Complex dot product: writes the sum of input[i]*taps[i] over num_points points to *result.
++ float * res = (float*) result; // result viewed as {real, imag}
++ float * in = (float*) input; // input viewed as interleaved real/imag floats
++ float * tp = (float*) taps; // taps viewed as interleaved real/imag floats
++ unsigned int n_2_ccomplex_blocks = num_points/2; // each block holds two complex points
++ unsigned int isodd = num_points & 1;
++
++ float sum0[2] = {0,0}; // {real, imag} accumulator for the first point of each block
++ float sum1[2] = {0,0}; // {real, imag} accumulator for the second point of each block
++ unsigned int i = 0;
++
++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++
++ in += 4;
++ tp += 4;
++ }
++
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
++
++ // Cleanup if we had an odd number of points (the loop body runs at most once)
++ for(i = 0; i < isodd; ++i) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++
++#if LV_HAVE_SSE && LV_HAVE_64
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ /* Complex dot product: *result = sum of input[i]*taps[i] for i in [0,num_points). Unaligned (movups) x86-64 SSE inline assembly; an odd last point is added in C after the asm. */
++ const unsigned long long num_bytes = (unsigned long long)num_points*8; // 64-bit operand: the asm reads all of %rcx, and the product cannot wrap
++ unsigned int isodd = num_points & 1;
++ /* NOTE(review): the asm loads 16 bytes from input and taps before testing the count and preloads the next 16 bytes at the end of each iteration, so it may read past the last point. */
++ asm
++ (
++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
++ "# const float *taps, unsigned num_bytes)\n\t"
++ "# float sum0 = 0;\n\t"
++ "# float sum1 = 0;\n\t"
++ "# float sum2 = 0;\n\t"
++ "# float sum3 = 0;\n\t"
++ "# do {\n\t"
++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++ "# input += 4;\n\t"
++ "# taps += 4; \n\t"
++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++ "# result[0] = sum0 + sum2;\n\t"
++ "# result[1] = sum1 + sum3;\n\t"
++ "# TODO: prefetch and better scheduling\n\t"
++ " xor %%r9, %%r9\n\t"
++ " xor %%r10, %%r10\n\t"
++ " movq %%rcx, %%rax\n\t"
++ " movq %%rcx, %%r8\n\t"
++ " movq %[rsi], %%r9\n\t"
++ " movq %[rdx], %%r10\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movups 0(%%r9), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movups 0(%%r10), %%xmm2\n\t"
++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++ " shr $4, %%r8\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movups (%%r9), %%xmmA\n\t"
++ "# movups (%%r10), %%xmmB\n\t"
++ "# movups %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movups %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movups %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movups 16(%%r9), %%xmm1\n\t"
++ " movups %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movups 16(%%r10), %%xmm3\n\t"
++ " movups %%xmm1, %%xmm5\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movups 32(%%r9), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " add $32, %%r9\n\t"
++ " movups 32(%%r10), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " add $32, %%r10\n\t"
++ ".%=L1_test:\n\t"
++ " dec %%rax\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " and $1, %%r8\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movups %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " mov $0x80000000, %%r9\n\t"
++ " movd %%r9, %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movups %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movups %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
++ :
++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
++ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "cc", "memory"
++ );
++ // The clobber list names every scratch and XMM register used, the flags, and memory:
++ // the asm stores to *result and reads input/taps, so the compiler must not cache them across it.
++ if(isodd) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
++
++ return;
++
++}
++
++#endif /* LV_HAVE_SSE && LV_HAVE_64 */
++
++
++
++
++#ifdef LV_HAVE_SSE3
++
++#include <pmmintrin.h>
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // Complex dot product of input and taps over num_points points, written to *result (SSE3, unaligned loads).
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2*sizeof(float)); // start the scalar accumulator at 0 + 0j
++
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points/2; // two complex values fit in one __m128
++ unsigned int isodd = num_points & 1;
++
++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
++
++ dotProdVal = _mm_setzero_ps(); // vector accumulator holding two partial complex sums
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
++
++ a += 2;
++ b += 2;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
++
++ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++
++ dotProduct += ( dotProductVector[0] + dotProductVector[1] ); // fold the two partial sums together
++
++ if(isodd) { // odd point count: add the last product in scalar code
++ dotProduct += input[num_points - 1] * taps[num_points - 1];
++ }
++
++ *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE3*/
++
++#ifdef LV_HAVE_SSE4_1
++
++#include <smmintrin.h>
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // Complex dot product of input and taps written to *result; four points per iteration (SSE4.1 _mm_dp_ps, unaligned loads).
++ unsigned int i = 0;
++ const unsigned int qtr_points = num_points/4;
++ const unsigned int isodd = num_points & 3; // number of leftover points (0..3), despite the name
++
++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
++ float *p_input, *p_taps;
++ __m64 *p_result;
++
++ p_result = (__m64*)result;
++ p_input = (float*)input;
++ p_taps = (float*)taps;
++ // NOTE(review): compiler-specific __m128i brace initialisation; presumably a sign-bit mask for the lowest float lane - confirm on non-GCC compilers.
++ static const __m128i neg = {0x000000000000000080000000};
++
++ real0 = _mm_setzero_ps(); // accumulates sums of real*real products
++ real1 = _mm_setzero_ps(); // accumulates sums of imag*imag products (sign flipped after the loop)
++ im0 = _mm_setzero_ps(); // accumulates sums of real(input)*imag(taps) products
++ im1 = _mm_setzero_ps(); // accumulates sums of imag(input)*real(taps) products
++
++ for(; i < qtr_points; ++i) {
++ xmm0 = _mm_loadu_ps(p_input);
++ xmm1 = _mm_loadu_ps(p_taps);
++
++ p_input += 4;
++ p_taps += 4;
++
++ xmm2 = _mm_loadu_ps(p_input);
++ xmm3 = _mm_loadu_ps(p_taps);
++
++ p_input += 4;
++ p_taps += 4;
++ // de-interleave the four complex values into separate real and imaginary vectors
++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
++
++ //imaginary vector from input
++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
++ //real vector from input
++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
++ //imaginary vector from taps
++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
++ //real vector from taps
++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
++ // mask 0xf1 leaves the 4-lane dot product in lane 0, mask 0xf2 leaves it in lane 1
++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
++
++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
++
++ real0 = _mm_add_ps(xmm4, real0);
++ real1 = _mm_add_ps(xmm5, real1);
++ im0 = _mm_add_ps(xmm6, im0);
++ im1 = _mm_add_ps(xmm7, im1);
++ }
++
++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); // XOR with the mask so real1 is subtracted below
++
++ im0 = _mm_add_ps(im0, im1);
++ real0 = _mm_add_ps(real0, real1);
++
++ im0 = _mm_add_ps(im0, real0); // lane 0 = real part, lane 1 = imaginary part
++
++ _mm_storel_pi(p_result, im0); // store the low two floats to *result
++ // add the leftover points that did not fill a group of four
++ for(i = num_points-isodd; i < num_points; i++) {
++ *result += input[i] * taps[i];
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++
++
++
++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <stdio.h>
++#include <string.h>
++
++
++#ifdef LV_HAVE_GENERIC
++
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // Complex dot product: writes the sum of input[i]*taps[i] over num_points points to *result.
++ // The block count comes straight from num_points: a byte count (num_points*8) held in an unsigned int can wrap for very large inputs.
++
++ float * res = (float*) result; // result viewed as {real, imag}
++ const float * in = (const float*) input; // input viewed as interleaved real/imag floats
++ const float * tp = (const float*) taps; // taps viewed as interleaved real/imag floats
++ unsigned int n_2_ccomplex_blocks = num_points/2; // each block holds two complex points
++ unsigned int isodd = num_points & 1;
++
++ float sum0[2] = {0,0}; // {real, imag} accumulator for the first point of each block
++ float sum1[2] = {0,0}; // {real, imag} accumulator for the second point of each block
++ unsigned int i = 0;
++
++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++
++ in += 4;
++ tp += 4;
++ }
++
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
++ // Cleanup if we had an odd number of points (the loop body runs at most once)
++ for(i = 0; i < isodd; ++i) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++#if LV_HAVE_SSE && LV_HAVE_64
++
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ /* Complex dot product: *result = sum of input[i]*taps[i] for i in [0,num_points). Aligned (movaps) x86-64 SSE inline assembly; an odd last point is added in C after the asm. */
++ const unsigned long long num_bytes = (unsigned long long)num_points*8; // 64-bit operand: the asm reads all of %rcx, and the product cannot wrap
++ unsigned int isodd = num_points & 1;
++ /* NOTE(review): the asm loads 16 bytes from input and taps before testing the count and preloads the next 16 bytes at the end of each iteration, so it may read past the last point. */
++ asm
++ (
++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
++ "# const float *taps, unsigned num_bytes)\n\t"
++ "# float sum0 = 0;\n\t"
++ "# float sum1 = 0;\n\t"
++ "# float sum2 = 0;\n\t"
++ "# float sum3 = 0;\n\t"
++ "# do {\n\t"
++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++ "# input += 4;\n\t"
++ "# taps += 4; \n\t"
++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++ "# result[0] = sum0 + sum2;\n\t"
++ "# result[1] = sum1 + sum3;\n\t"
++ "# TODO: prefetch and better scheduling\n\t"
++ " xor %%r9, %%r9\n\t"
++ " xor %%r10, %%r10\n\t"
++ " movq %%rcx, %%rax\n\t"
++ " movq %%rcx, %%r8\n\t"
++ " movq %[rsi], %%r9\n\t"
++ " movq %[rdx], %%r10\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movaps 0(%%r9), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movaps 0(%%r10), %%xmm2\n\t"
++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++ " shr $4, %%r8\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movaps (%%r9), %%xmmA\n\t"
++ "# movaps (%%r10), %%xmmB\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movaps %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movaps 16(%%r9), %%xmm1\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movaps 16(%%r10), %%xmm3\n\t"
++ " movaps %%xmm1, %%xmm5\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movaps 32(%%r9), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " add $32, %%r9\n\t"
++ " movaps 32(%%r10), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " add $32, %%r10\n\t"
++ ".%=L1_test:\n\t"
++ " dec %%rax\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " and $1, %%r8\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " mov $0x80000000, %%r9\n\t"
++ " movd %%r9, %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movaps %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movaps %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
++ :
++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
++ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "cc", "memory"
++ );
++ // The clobber list names every scratch and XMM register used, the flags, and memory:
++ // the asm stores to *result and reads input/taps, so the compiler must not cache them across it.
++ if(isodd) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
++
++ return;
++
++}
++
++#endif /* LV_HAVE_SSE && LV_HAVE_64 */
++
++#if LV_HAVE_SSE && LV_HAVE_32
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // No 32-bit SSE path is active: the work is delegated to the generic kernel; the assembly below is compiled out by #if 0.
++ volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
++
++#if 0
++ const unsigned int num_bytes = num_points*8;
++ unsigned int isodd = num_points & 1;
++
++ asm volatile
++ (
++ " #pushl %%ebp\n\t"
++ " #movl %%esp, %%ebp\n\t"
++ " movl 12(%%ebp), %%eax # input\n\t"
++ " movl 16(%%ebp), %%edx # taps\n\t"
++ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++ " movaps 0(%%eax), %%xmm0\n\t"
++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++ " movaps 0(%%edx), %%xmm2\n\t"
++ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
++ " jmp .%=L1_test\n\t"
++ " # 4 taps / loop\n\t"
++ " # something like ?? cycles / loop\n\t"
++ ".%=Loop1: \n\t"
++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++ "# movaps (%%eax), %%xmmA\n\t"
++ "# movaps (%%edx), %%xmmB\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++ "# mulps %%xmmB, %%xmmA\n\t"
++ "# mulps %%xmmZ, %%xmmB\n\t"
++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++ "# xorps %%xmmPN, %%xmmA\n\t"
++ "# movaps %%xmmA, %%xmmZ\n\t"
++ "# unpcklps %%xmmB, %%xmmA\n\t"
++ "# unpckhps %%xmmB, %%xmmZ\n\t"
++ "# movaps %%xmmZ, %%xmmY\n\t"
++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++ "# addps %%xmmZ, %%xmmA\n\t"
++ "# addps %%xmmA, %%xmmC\n\t"
++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++ " movaps 16(%%eax), %%xmm1\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " movaps 16(%%edx), %%xmm3\n\t"
++ " movaps %%xmm1, %%xmm5\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm3, %%xmm1\n\t"
++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++ " addps %%xmm1, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " movaps 32(%%eax), %%xmm0\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ " mulps %%xmm5, %%xmm3\n\t"
++ " addl $32, %%eax\n\t"
++ " movaps 32(%%edx), %%xmm2\n\t"
++ " addps %%xmm3, %%xmm7\n\t"
++ " addl $32, %%edx\n\t"
++ ".%=L1_test:\n\t"
++ " decl %%ecx\n\t"
++ " jge .%=Loop1\n\t"
++ " # We've handled the bulk of multiplies up to here.\n\t"
++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++ " # If so, we've got 2 more taps to do.\n\t"
++ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
++ " shrl $4, %%ecx\n\t"
++ " andl $1, %%ecx\n\t"
++ " je .%=Leven\n\t"
++ " # The count was odd, do 2 more taps.\n\t"
++ " # Note that we've already got mm0/mm2 preloaded\n\t"
++ " # from the main loop.\n\t"
++ " movaps %%xmm0, %%xmm4\n\t"
++ " mulps %%xmm2, %%xmm0\n\t"
++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++ " addps %%xmm0, %%xmm6\n\t"
++ " mulps %%xmm4, %%xmm2\n\t"
++ " addps %%xmm2, %%xmm7\n\t"
++ ".%=Leven:\n\t"
++ " # neg inversor\n\t"
++ " movl 8(%%ebp), %%eax \n\t"
++ " xorps %%xmm1, %%xmm1\n\t"
++ " movl $0x80000000, (%%eax)\n\t"
++ " movss (%%eax), %%xmm1\n\t"
++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++ " # pfpnacc\n\t"
++ " xorps %%xmm1, %%xmm6\n\t"
++ " movaps %%xmm6, %%xmm2\n\t"
++ " unpcklps %%xmm7, %%xmm6\n\t"
++ " unpckhps %%xmm7, %%xmm2\n\t"
++ " movaps %%xmm2, %%xmm3\n\t"
++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++ " addps %%xmm2, %%xmm6\n\t"
++ " # xmm6 = r1 i2 r3 i4\n\t"
++ " #movl 8(%%ebp), %%eax # @result\n\t"
++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
++ " #popl %%ebp\n\t"
++ :
++ :
++ : "eax", "ecx", "edx"
++ );
++
++
++ int getem = num_bytes % 16;
++
++ if(isodd) {
++ *result += (input[num_points - 1] * taps[num_points - 1]);
++ }
++
++ return;
++#endif
++}
++
++#endif /* LV_HAVE_SSE && LV_HAVE_32 */
++
++#ifdef LV_HAVE_SSE3
++
++#include <pmmintrin.h>
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // Complex dot product of input and taps over num_points points, written to *result (SSE3, aligned loads).
++ // The iteration count comes straight from num_points: a byte count (num_points*8) held in an unsigned int can wrap for very large inputs.
++ unsigned int isodd = num_points & 1;
++
++ lv_32fc_t dotProduct;
++ memset(&dotProduct, 0x0, 2*sizeof(float)); // start the scalar accumulator at 0 + 0j
++
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points/2; // two complex values fit in one __m128
++
++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++
++ const lv_32fc_t* a = input;
++ const lv_32fc_t* b = taps;
++
++ dotProdVal = _mm_setzero_ps(); // vector accumulator holding two partial complex sums
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
++
++ a += 2;
++ b += 2;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
++
++ _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++
++ dotProduct += ( dotProductVector[0] + dotProductVector[1] ); // fold the two partial sums together
++
++ if(isodd) { // odd point count: add the last product in scalar code
++ dotProduct += input[num_points - 1] * taps[num_points - 1];
++ }
++
++ *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE3*/
++
++#ifdef LV_HAVE_SSE4_1
++
++#include <smmintrin.h>
++
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++ // Complex dot product of input and taps written to *result; four points per iteration (SSE4.1 _mm_dp_ps, aligned loads).
++ unsigned int i = 0;
++ const unsigned int qtr_points = num_points/4;
++ const unsigned int isodd = num_points & 3; // number of leftover points (0..3), despite the name
++
++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
++ float *p_input, *p_taps;
++ __m64 *p_result;
++ // NOTE(review): compiler-specific __m128i brace initialisation; presumably a sign-bit mask for the lowest float lane - confirm on non-GCC compilers.
++ static const __m128i neg = {0x000000000000000080000000};
++
++ p_result = (__m64*)result;
++ p_input = (float*)input;
++ p_taps = (float*)taps;
++
++ real0 = _mm_setzero_ps(); // accumulates sums of real*real products
++ real1 = _mm_setzero_ps(); // accumulates sums of imag*imag products (sign flipped after the loop)
++ im0 = _mm_setzero_ps(); // accumulates sums of real(input)*imag(taps) products
++ im1 = _mm_setzero_ps(); // accumulates sums of imag(input)*real(taps) products
++
++ for(; i < qtr_points; ++i) {
++ xmm0 = _mm_load_ps(p_input);
++ xmm1 = _mm_load_ps(p_taps);
++
++ p_input += 4;
++ p_taps += 4;
++
++ xmm2 = _mm_load_ps(p_input);
++ xmm3 = _mm_load_ps(p_taps);
++
++ p_input += 4;
++ p_taps += 4;
++ // de-interleave the four complex values into separate real and imaginary vectors
++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
++
++ //imaginary vector from input
++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
++ //real vector from input
++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
++ //imaginary vector from taps
++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
++ //real vector from taps
++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
++ // mask 0xf1 leaves the 4-lane dot product in lane 0, mask 0xf2 leaves it in lane 1
++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
++
++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
++
++ real0 = _mm_add_ps(xmm4, real0);
++ real1 = _mm_add_ps(xmm5, real1);
++ im0 = _mm_add_ps(xmm6, im0);
++ im1 = _mm_add_ps(xmm7, im1);
++ }
++
++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); // XOR with the mask so real1 is subtracted below
++
++ im0 = _mm_add_ps(im0, im1);
++ real0 = _mm_add_ps(real0, real1);
++
++ im0 = _mm_add_ps(im0, real0); // lane 0 = real part, lane 1 = imaginary part
++
++ _mm_storel_pi(p_result, im0); // store the low two floats to *result
++ // add the leftover points that did not fill a group of four
++ for(i = num_points-isodd; i < num_points; i++) {
++ *result += input[i] * taps[i];
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,170 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Multiplies the two input complex vectors element by element and stores the results in the third vector (SSE3, unaligned loads/stores)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // two complex values fit in one __m128
++
++ __m128 x, y, yl, yh, z, tmp1, tmp2;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++
++ for(;number < halfPoints; number++){
++
++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
++
++ a += 2;
++ b += 2;
++ c += 2;
++ }
++
++ if((num_points % 2) != 0) { // odd point count: one pair is left over for scalar code
++ *c = (*a) * (*b);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Multiplies the two input complex vectors element by element and stores the results in the third vector
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++ lv_32fc_t* cPtr = cVector; // write cursor into the output vector
++ const lv_32fc_t* aPtr = aVector; // read cursor into the first input
++ const lv_32fc_t* bPtr= bVector; // read cursor into the second input
++ unsigned int number = 0;
++ // one complex product per point: cVector[i] = aVector[i] * bVector[i]
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Multiplies the two input complex vectors element by element and stores the results in the third vector (SSE3, aligned loads/stores)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // two complex values fit in one __m128
++
++ __m128 x, y, yl, yh, z, tmp1, tmp2;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = aVector;
++ const lv_32fc_t* b = bVector;
++ for(;number < halfPoints; number++){
++
++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ _mm_store_ps((float*)c,z); // Store the results back into the C container
++
++ a += 2;
++ b += 2;
++ c += 2;
++ }
++
++ if((num_points % 2) != 0) { // odd point count: one pair is left over for scalar code
++ *c = (*a) * (*b);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++ \brief Multiplies the two input complex vectors element by element and stores the results in the third vector
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++ lv_32fc_t* cPtr = cVector; // write cursor into the output vector
++ const lv_32fc_t* aPtr = aVector; // read cursor into the first input
++ const lv_32fc_t* bPtr= bVector; // read cursor into the second input
++ unsigned int number = 0;
++ // one complex product per point: cVector[i] = aVector[i] * bVector[i]
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++ /*!
++ \brief Multiplies the two input complex vectors and stores their results in the third vector (forwards to the external ORC implementation)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++ volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); // thin wrapper: all arguments are passed through unchanged
++}
++#endif /* LV_HAVE_ORC */
++
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,409 @@
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++/*!
++ * TODO: Code the SSE4 version and benchmark it
++ */
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++
++
++ /*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3, no alignment required on any input vector)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // each SSE iteration consumes two complex samples
++
++ lv_32fc_t dotProduct_E;
++ memset(&dotProduct_E, 0x0, 2*sizeof(float));
++ lv_32fc_t dotProduct_P;
++ memset(&dotProduct_P, 0x0, 2*sizeof(float));
++ lv_32fc_t dotProduct_L;
++ memset(&dotProduct_L, 0x0, 2*sizeof(float));
++
++ // Aux vars
++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
++
++ z_E = _mm_setzero_ps();
++ z_P = _mm_setzero_ps();
++ z_L = _mm_setzero_ps();
++
++ //input and output vectors
++ //lv_32fc_t* _input_BB = input_BB;
++ const lv_32fc_t* _input = input;
++ const lv_32fc_t* _carrier = carrier;
++ const lv_32fc_t* _E_code = E_code;
++ const lv_32fc_t* _P_code = P_code;
++ const lv_32fc_t* _L_code = L_code;
++
++ for(;number < halfPoints; number++)
++ {
++ // carrier wipe-off (vector point-to-point product)
++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container
++
++ // correlation E,P,L (3x vector scalar product)
++ // Early
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ x = z; // x now holds the two baseband (carrier wiped-off) samples
++
++ y = _mm_loadu_ps((float*)_E_code); // Unaligned load (this is the _u_ kernel): cr,ci,dr,di of the Early code
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
++
++ // Prompt
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)_P_code); // Unaligned load (this is the _u_ kernel): cr,ci,dr,di of the Prompt code
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the swap left by the Early stage: x is ar,ai,br,bi again
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
++
++ // Late
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)_L_code); // Unaligned load (this is the _u_ kernel): cr,ci,dr,di of the Late code
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the swap left by the Prompt stage: x is ar,ai,br,bi again
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
++
++ /*pointer increment*/
++ _carrier += 2;
++ _input += 2;
++ //_input_BB += 2;
++ _E_code += 2;
++ _P_code += 2;
++ _L_code +=2;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB;
++
++ _mm_store_ps((float*)dotProductVector_E,z_E); // Aligned store is safe here: the local array is declared 16-byte aligned
++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++
++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); // fold the two partial sums of each accumulator
++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
++
++ if((num_points % 2) != 0) // odd length: one sample is left over after the SIMD loop
++ {
++ //_input_BB = (*_input) * (*_carrier);
++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
++ }
++
++ *E_out = dotProduct_E;
++ *P_out = dotProduct_P;
++ *L_out = dotProduct_L;
++}
++
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
++{
++ lv_32fc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation
++ for(unsigned int i=0; i < num_points; ++i) // unsigned counter: same type as num_points, no signed overflow for large lengths
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Now get early, late, and prompt values for each
++ *E_out += bb_signal_sample * E_code[i];
++ *P_out += bb_signal_sample * P_code[i];
++ *L_out += bb_signal_sample * L_code[i];
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3; every input vector is read with aligned loads)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // each SSE iteration consumes two complex samples
++
++ lv_32fc_t dotProduct_E;
++ memset(&dotProduct_E, 0x0, 2*sizeof(float));
++ lv_32fc_t dotProduct_P;
++ memset(&dotProduct_P, 0x0, 2*sizeof(float));
++ lv_32fc_t dotProduct_L;
++ memset(&dotProduct_L, 0x0, 2*sizeof(float));
++
++ // Aux vars
++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
++
++ z_E = _mm_setzero_ps();
++ z_P = _mm_setzero_ps();
++ z_L = _mm_setzero_ps();
++
++ //input and output vectors
++ //lv_32fc_t* _input_BB = input_BB;
++ const lv_32fc_t* _input = input;
++ const lv_32fc_t* _carrier = carrier;
++ const lv_32fc_t* _E_code = E_code;
++ const lv_32fc_t* _P_code = P_code;
++ const lv_32fc_t* _L_code = L_code;
++
++ for(;number < halfPoints; number++)
++ {
++ // carrier wipe-off (vector point-to-point product)
++ x = _mm_load_ps((float*)_input); // Aligned load of ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)_carrier); // Aligned load of cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container
++
++ // correlation E,P,L (3x vector scalar product)
++ // Early
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ x = z; // x now holds the two baseband (carrier wiped-off) samples
++
++ y = _mm_load_ps((float*)_E_code); // Aligned load of the Early code as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
++
++ // Prompt
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)_P_code); // Aligned load of the Prompt code as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the swap left by the Early stage: x is ar,ai,br,bi again
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
++
++ // Late
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)_L_code); // Aligned load of the Late code as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the swap left by the Prompt stage: x is ar,ai,br,bi again
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
++
++ /*pointer increment*/
++ _carrier += 2;
++ _input += 2;
++ //_input_BB += 2;
++ _E_code += 2;
++ _P_code += 2;
++ _L_code +=2;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB;
++
++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++
++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); // fold the two partial sums of each accumulator
++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
++
++ if((num_points % 2) != 0) // odd length: one sample is left over after the SIMD loop
++ {
++ //_input_BB = (*_input) * (*_carrier);
++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
++ }
++
++ *E_out = dotProduct_E;
++ *P_out = dotProduct_P;
++ *L_out = dotProduct_L;
++}
++
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
++{
++ lv_32fc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation
++ for(unsigned int i=0; i < num_points; ++i) // unsigned counter: same type as num_points, no signed overflow for large lengths
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Now get early, late, and prompt values for each
++ *E_out += bb_signal_sample * E_code[i];
++ *P_out += bb_signal_sample * P_code[i];
++ *L_out += bb_signal_sample * L_code[i];
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,848 @@
++/*!
++ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors
++ * \authors <ul>
++ * <li>Javier Arribas, 2011. jarribas(at)cttc.es
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the
++ * real part and 32 bits the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 64 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - VE values are calculated by multiplying the input signal in BB by the
++ * VE code (multiplication of 64 bits vectors), accumulating the results
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 64 bits vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 64 bits vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 64 bits vectors), accumulating the results
++ * - VL values are calculated by multiplying the input signal in BB by the
++ * VL code (multiplication of 64 bits vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, unaligned loads, four complex samples per iteration)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 4; // despite the name: number of 4-sample AVX iterations
++
++ lv_32fc_t dotProduct_VE;
++ lv_32fc_t dotProduct_E;
++ lv_32fc_t dotProduct_P;
++ lv_32fc_t dotProduct_L;
++ lv_32fc_t dotProduct_VL;
++
++ // Aux vars
++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
++ __m256 bb_signal_sample, bb_signal_sample_shuffled;
++
++ z_VE = _mm256_setzero_ps();
++ z_E = _mm256_setzero_ps();
++ z_P = _mm256_setzero_ps();
++ z_L = _mm256_setzero_ps();
++ z_VL = _mm256_setzero_ps();
++
++ //input and output vectors
++ const lv_32fc_t* _input = input;
++ const lv_32fc_t* _carrier = carrier;
++ const lv_32fc_t* _VE_code = VE_code;
++ const lv_32fc_t* _E_code = E_code;
++ const lv_32fc_t* _P_code = P_code;
++ const lv_32fc_t* _L_code = L_code;
++ const lv_32fc_t* _VL_code = VL_code;
++
++ for(;number < halfPoints; number++)
++ {
++ // carrier wipe-off (vector point-to-point product)
++ x = _mm256_loadu_ps((float*)_input); // Load 4 complex input samples as interleaved re,im floats (ar,ai,br,bi,...)
++ y = _mm256_loadu_ps((float*)_carrier); // Load 4 complex carrier samples as interleaved re,im floats (cr,ci,dr,di,...)
++
++ yl = _mm256_moveldup_ps(y); // Load yl with the duplicated real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Load yh with the duplicated imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++
++ x = _mm256_shuffle_ps(x,x,0xB1); // Swap re/im of every sample: x becomes ai,ar,bi,br,...
++
++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Swapped copy (im,re per sample), computed once and reused by all five correlators
++
++ // correlation VE,E,P,L,VL (5x vector scalar product)
++ // VE
++ y = _mm256_loadu_ps((float*)_VE_code); // Load 4 complex VE code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
++
++ // Early
++ y = _mm256_loadu_ps((float*)_E_code); // Load 4 complex Early code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
++
++ // Prompt
++ y = _mm256_loadu_ps((float*)_P_code); // Load 4 complex Prompt code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
++
++ // Late
++ y = _mm256_loadu_ps((float*)_L_code); // Load 4 complex Late code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
++
++ // VL
++ y = _mm256_loadu_ps((float*)_VL_code); // Load 4 complex VL code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
++
++ /*pointer increment*/
++ _carrier += 4;
++ _input += 4;
++ _VE_code += 4;
++ _E_code += 4;
++ _P_code += 4;
++ _L_code += 4;
++ _VL_code += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
++
++ _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
++ _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++ _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++ _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++ _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
++
++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); // plain assignment: this is where each dotProduct_* gets its first value
++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
++
++ for (int i = 0; i<(num_points % 4); ++i) // scalar tail: the 0..3 samples left over after the AVX loop
++ {
++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); // _input and _carrier advance only here, after all five correlators have used the current sample
++ }
++
++ *VE_out = dotProduct_VE;
++ *E_out = dotProduct_E;
++ *P_out = dotProduct_P;
++ *L_out = dotProduct_L;
++ *VL_out = dotProduct_VL;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++ /*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2;
++
++ lv_32fc_t dotProduct_VE;
++ lv_32fc_t dotProduct_E;
++ lv_32fc_t dotProduct_P;
++ lv_32fc_t dotProduct_L;
++ lv_32fc_t dotProduct_VL;
++
++ // Aux vars
++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
++ __m128 bb_signal_sample, bb_signal_sample_shuffled;
++
++ z_VE = _mm_setzero_ps();
++ z_E = _mm_setzero_ps();
++ z_P = _mm_setzero_ps();
++ z_L = _mm_setzero_ps();
++ z_VL = _mm_setzero_ps();
++
++ //input and output vectors
++ const lv_32fc_t* _input = input;
++ const lv_32fc_t* _carrier = carrier;
++ const lv_32fc_t* _VE_code = VE_code;
++ const lv_32fc_t* _E_code = E_code;
++ const lv_32fc_t* _P_code = P_code;
++ const lv_32fc_t* _L_code = L_code;
++ const lv_32fc_t* _VL_code = VL_code;
++
++ for(;number < halfPoints; number++)
++ {
++ // carrier wipe-off (vector point-to-point product)
++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
++
++ // correlation VE,E,P,L,VL (5x vector scalar product)
++ // VE
++ y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
++
++ // Early
++ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
++
++ // Prompt
++ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
++
++ // Late
++ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
++
++ // VL
++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
++
++ /*pointer increment*/
++ _carrier += 2;
++ _input += 2;
++ _VE_code += 2;
++ _E_code += 2;
++ _P_code += 2;
++ _L_code +=2;
++ _VL_code +=2;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
++
++ _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
++
++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
++
++ if((num_points % 2) != 0)
++ {
++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
++ }
++
++ *VE_out = dotProduct_VE;
++ *E_out = dotProduct_E;
++ *P_out = dotProduct_P;
++ *L_out = dotProduct_L;
++ *VL_out = dotProduct_VL;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++ lv_32fc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *VE_out = 0;
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ *VL_out = 0;
++ // perform the VE, Early, Prompt, Late and VL correlations in a single pass
++ for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points, no signed overflow
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Accumulate the wiped-off sample against each of the five code replicas
++ *VE_out += bb_signal_sample * VE_code[i];
++ *E_out += bb_signal_sample * E_code[i];
++ *P_out += bb_signal_sample * P_code[i];
++ *L_out += bb_signal_sample * L_code[i];
++ *VL_out += bb_signal_sample * VL_code[i];
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, aligned loads)
++ \param input The input signal input (read with _mm256_load_ps: must be 32-byte aligned)
++ \param carrier The carrier signal input (32-byte aligned)
++ \param VE_code VE PRN code replica input (32-byte aligned)
++ \param E_code Early PRN code replica input (32-byte aligned)
++ \param P_code Prompt PRN code replica input (32-byte aligned)
++ \param L_code Late PRN code replica input (32-byte aligned)
++ \param VL_code VL PRN code replica input (32-byte aligned)
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4; // each AVX iteration consumes 4 complex samples
++
++ lv_32fc_t dotProduct_VE;
++ lv_32fc_t dotProduct_E;
++ lv_32fc_t dotProduct_P;
++ lv_32fc_t dotProduct_L;
++ lv_32fc_t dotProduct_VL;
++
++ // Aux vars
++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
++ __m256 bb_signal_sample, bb_signal_sample_shuffled;
++
++ z_VE = _mm256_setzero_ps();
++ z_E = _mm256_setzero_ps();
++ z_P = _mm256_setzero_ps();
++ z_L = _mm256_setzero_ps();
++ z_VL = _mm256_setzero_ps();
++
++ //running read pointers over the input vectors
++ const lv_32fc_t* _input = input;
++ const lv_32fc_t* _carrier = carrier;
++ const lv_32fc_t* _VE_code = VE_code;
++ const lv_32fc_t* _E_code = E_code;
++ const lv_32fc_t* _P_code = P_code;
++ const lv_32fc_t* _L_code = L_code;
++ const lv_32fc_t* _VL_code = VL_code;
++
++ for(;number < quarterPoints; number++)
++ {
++ // carrier wipe-off (vector point-to-point product)
++ x = _mm256_load_ps((float*)_input); // Load 4 complex input samples as ar,ai,br,bi,...
++ y = _mm256_load_ps((float*)_carrier); // Load 4 complex carrier samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Duplicate the real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Duplicate the imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++
++ x = _mm256_shuffle_ps(x,x,0xB1); // Swap re/im inside every sample: ai,ar,bi,br,...
++
++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Same samples with re/im swapped, reused by the five products below
++
++ // correlation VE,E,P,L,VL (5x vector scalar product)
++ // VE
++ y = _mm256_load_ps((float*)_VE_code); // Load 4 complex code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Duplicate the real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Duplicate the imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
++
++ // Early
++ y = _mm256_load_ps((float*)_E_code); // Load 4 complex code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Duplicate the real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Duplicate the imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
++
++ // Prompt
++ y = _mm256_load_ps((float*)_P_code); // Load 4 complex code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Duplicate the real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Duplicate the imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
++
++ // Late
++ y = _mm256_load_ps((float*)_L_code); // Load 4 complex code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Duplicate the real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Duplicate the imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
++
++ // VL
++ y = _mm256_load_ps((float*)_VL_code); // Load 4 complex code samples as cr,ci,dr,di,...
++
++ yl = _mm256_moveldup_ps(y); // Duplicate the real parts: cr,cr,dr,dr,...
++ yh = _mm256_movehdup_ps(y); // Duplicate the imaginary parts: ci,ci,di,di,...
++
++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
++
++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
++ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
++
++ /*pointer increment: 4 complex samples were consumed from every vector*/
++ _carrier += 4;
++ _input += 4;
++ _VE_code += 4;
++ _E_code += 4;
++ _P_code += 4;
++ _L_code += 4;
++ _VL_code += 4;
++ }
++
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
++
++ _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
++ _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++ _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++ _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++ _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
++
++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
++
++ for (unsigned int i = 0; i<(num_points % 4); ++i) // scalar tail: the 0..3 samples left after the vector loop
++ {
++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); // input and carrier advance only here, after their last use
++ }
++
++ *VE_out = dotProduct_VE;
++ *E_out = dotProduct_E;
++ *P_out = dotProduct_P;
++ *L_out = dotProduct_L;
++ *VL_out = dotProduct_VL;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (SSE3, aligned loads)
++ \param input The input signal input (read with _mm_load_ps: must be 16-byte aligned)
++ \param carrier The carrier signal input (16-byte aligned)
++ \param VE_code VE PRN code replica input (16-byte aligned)
++ \param E_code Early PRN code replica input (16-byte aligned)
++ \param P_code Prompt PRN code replica input (16-byte aligned)
++ \param L_code Late PRN code replica input (16-byte aligned)
++ \param VL_code VL PRN code replica input (16-byte aligned)
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++ unsigned int number = 0;
++ const unsigned int halfPoints = num_points / 2; // each SSE iteration consumes 2 complex samples
++
++ lv_32fc_t dotProduct_VE;
++ lv_32fc_t dotProduct_E;
++ lv_32fc_t dotProduct_P;
++ lv_32fc_t dotProduct_L;
++ lv_32fc_t dotProduct_VL;
++
++ // Aux vars
++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
++ __m128 bb_signal_sample, bb_signal_sample_shuffled;
++
++ z_VE = _mm_setzero_ps();
++ z_E = _mm_setzero_ps();
++ z_P = _mm_setzero_ps();
++ z_L = _mm_setzero_ps();
++ z_VL = _mm_setzero_ps();
++
++ //running read pointers over the input vectors
++ const lv_32fc_t* _input = input;
++ const lv_32fc_t* _carrier = carrier;
++ const lv_32fc_t* _VE_code = VE_code;
++ const lv_32fc_t* _E_code = E_code;
++ const lv_32fc_t* _P_code = P_code;
++ const lv_32fc_t* _L_code = L_code;
++ const lv_32fc_t* _VL_code = VL_code;
++
++ for(;number < halfPoints; number++)
++ {
++ // carrier wipe-off (vector point-to-point product)
++ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
++ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
++
++ // correlation VE,E,P,L,VL (5x vector scalar product)
++ // VE
++ y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
++
++ // Early
++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
++
++ // Prompt
++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
++
++ // Late
++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
++
++ // VL
++ // The wiped-off samples in bb_signal_sample are reused; only the VL code replica is loaded here
++ y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
++
++ /*pointer increment: 2 complex samples were consumed from every vector*/
++ _carrier += 2;
++ _input += 2;
++ _VE_code += 2;
++ _E_code += 2;
++ _P_code += 2;
++ _L_code +=2;
++ _VL_code +=2;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
++
++ _mm_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++ _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
++
++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
++
++ if((num_points % 2) != 0) // odd num_points: one sample is left after the vector loop
++ {
++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
++ }
++
++ *VE_out = dotProduct_VE;
++ *E_out = dotProduct_E;
++ *P_out = dotProduct_P;
++ *L_out = dotProduct_L;
++ *VL_out = dotProduct_VL;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++ lv_32fc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *VE_out = 0;
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ *VL_out = 0;
++ // perform the VE, Early, Prompt, Late and VL correlations in a single pass
++ for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points, no signed overflow
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Accumulate the wiped-off sample against each of the five code replicas
++ *VE_out += bb_signal_sample * VE_code[i];
++ *E_out += bb_signal_sample * E_code[i];
++ *P_out += bb_signal_sample * P_code[i];
++ *L_out += bb_signal_sample * L_code[i];
++ *VL_out += bb_signal_sample * VL_code[i];
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,243 @@
++/*!
++ * \file volk_gnsssdr_64f_accumulator_64f.h
++ * \brief Volk protokernel: 64 bits (double) scalar accumulator
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that implements an accumulator of double values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer (AVX, unaligned loads)
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated (no alignment requirement)
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){
++ double returnValue = 0;
++ const unsigned int sse_iters = num_points / 4; // each AVX iteration consumes 4 doubles
++
++ const double* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
++ __m256d accumulator = _mm256_setzero_pd();
++ __m256d aVal = _mm256_setzero_pd();
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ aVal = _mm256_loadu_pd(aPtr);
++ accumulator = _mm256_add_pd(accumulator, aVal); // four independent partial sums, one per lane
++ aPtr += 4;
++ }
++
++ _mm256_storeu_pd((double*)tempBuffer,accumulator);
++
++ for(int i = 0; i<4; ++i){ // horizontal add of the four partial sums
++ returnValue += tempBuffer[i];
++ }
++
++ for(unsigned int i = 0; i<(num_points % 4); ++i){ // scalar tail: the 0..3 values left over
++ returnValue += (*aPtr++);
++ }
++
++ *result = returnValue;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer (SSE, unaligned loads)
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated (no alignment requirement)
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){
++ double returnValue = 0;
++ const unsigned int sse_iters = num_points / 2; // each SSE iteration consumes 2 doubles
++
++ const double* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
++ __m128d accumulator = _mm_setzero_pd(); // __m128d and the _pd intrinsics are SSE2, not covered by <xmmintrin.h> alone
++ __m128d aVal = _mm_setzero_pd();
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ aVal = _mm_loadu_pd(aPtr);
++ accumulator = _mm_add_pd(accumulator, aVal); // two independent partial sums, one per lane
++ aPtr += 2;
++ }
++
++ _mm_storeu_pd((double*)tempBuffer,accumulator);
++
++ for(int i = 0; i<2; ++i){ // horizontal add of the two partial sums
++ returnValue += tempBuffer[i];
++ }
++
++ for(unsigned int i = 0; i<(num_points % 2); ++i){ // scalar tail: at most one value left over
++ returnValue += (*aPtr++);
++ }
++
++ *result = returnValue;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds up the first num_points doubles of inputBuffer, in order, with plain scalar code
++ \param result Where the sum is written (0 when num_points is 0)
++ \param inputBuffer The values to be summed
++ \param num_points How many values of inputBuffer take part in the sum
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){
++ double sum = 0;
++ unsigned int k;
++
++ for(k = 0; k < num_points; ++k){
++ sum += inputBuffer[k];
++ }
++ *result = sum;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer (AVX, aligned loads)
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated (read with _mm256_load_pd: must be 32-byte aligned)
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){
++ double returnValue = 0;
++ const unsigned int sse_iters = num_points / 4; // each AVX iteration consumes 4 doubles
++
++ const double* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
++ __m256d accumulator = _mm256_setzero_pd();
++ __m256d aVal = _mm256_setzero_pd();
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ aVal = _mm256_load_pd(aPtr);
++ accumulator = _mm256_add_pd(accumulator, aVal); // four independent partial sums, one per lane
++ aPtr += 4;
++ }
++
++ _mm256_store_pd((double*)tempBuffer,accumulator);
++
++ for(int i = 0; i<4; ++i){ // horizontal add of the four partial sums
++ returnValue += tempBuffer[i];
++ }
++
++ for(unsigned int i = 0; i<(num_points % 4); ++i){ // scalar tail: the 0..3 values left over
++ returnValue += (*aPtr++);
++ }
++
++ *result = returnValue;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer (SSE, aligned loads)
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated (read with _mm_load_pd: must be 16-byte aligned)
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){
++ double returnValue = 0;
++ const unsigned int sse_iters = num_points / 2; // each SSE iteration consumes 2 doubles
++
++ const double* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
++ __m128d accumulator = _mm_setzero_pd(); // __m128d and the _pd intrinsics are SSE2, not covered by <xmmintrin.h> alone
++ __m128d aVal = _mm_setzero_pd();
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ aVal = _mm_load_pd(aPtr);
++ accumulator = _mm_add_pd(accumulator, aVal); // two independent partial sums, one per lane
++ aPtr += 2;
++ }
++
++ _mm_store_pd((double*)tempBuffer,accumulator);
++
++ for(int i = 0; i<2; ++i){ // horizontal add of the two partial sums
++ returnValue += tempBuffer[i];
++ }
++
++ for(unsigned int i = 0; i<(num_points % 2); ++i){ // scalar tail: at most one value left over
++ returnValue += (*aPtr++);
++ }
++
++ *result = returnValue;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Scalar fallback: sums the first num_points doubles of inputBuffer from first to last
++ \param result Destination of the sum (0 when num_points is 0)
++ \param inputBuffer The values to add together
++ \param num_points The count of values read from inputBuffer
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){
++ const double* const values = inputBuffer;
++ double total = 0;
++
++ for(unsigned int idx = 0; idx != num_points; ++idx){
++ total += values[idx];
++ }
++ *result = total;
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,183 @@
++/*!
++ * \file volk_gnsssdr_8i_accumulator_s8i.h
++ * \brief Volk protokernel: 8 bits (char) scalar accumulator
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that implements an accumulator of char values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates (sums) the values in the input buffer; the sum is kept in 8-bit lanes/char, so it is reduced to 8 bits. Unaligned loads (_mm_lddqu_si128 is an SSE3 intrinsic, hence pmmintrin.h)
++ \param result The accumulated result (a single char is written)
++ \param inputBuffer The buffer of data to be accumulated (no alignment required)
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){
++ char returnValue = 0;
++ const unsigned int sse_iters = num_points / 16;
++
++ const char* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
++ __m128i accumulator = _mm_setzero_si128();
++ __m128i aVal = _mm_setzero_si128();
++
++ for(unsigned int number = 0; number < sse_iters; number++){
++ aVal = _mm_lddqu_si128((__m128i*)aPtr);
++ accumulator = _mm_add_epi8(accumulator, aVal);
++ aPtr += 16;
++ }
++ _mm_storeu_si128((__m128i*)tempBuffer,accumulator);
++ // Horizontal reduction of the 16 partial sums held in the vector accumulator
++ for(int i = 0; i<16; ++i){
++ returnValue += tempBuffer[i];
++ }
++ // Tail: the num_points % 16 values left after the vectorised loop (aPtr already points at them)
++ for(unsigned int i = 0; i<(num_points % 16); ++i){
++ returnValue += (*aPtr++);
++ }
++
++ *result = returnValue;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Accumulates (sums) the values in the input buffer with a plain scalar loop; the running sum is held in a char, so it is reduced to 8 bits
++ \param result The accumulated result (a single char is written; 0 when num_points is 0)
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){
++ const char* aPtr = inputBuffer;
++ char returnValue = 0;
++
++ for(unsigned int number = 0;number < num_points; number++){
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates (sums) the values in the input buffer; the sum is kept in 8-bit lanes/char, so it is reduced to 8 bits. Aligned loads (_mm_load_si128)
++ \param result The accumulated result (a single char is written)
++ \param inputBuffer The buffer of data to be accumulated (read with aligned 16-byte loads)
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){
++ char returnValue = 0;
++ const unsigned int sse_iters = num_points / 16;
++
++ const char* aPtr = inputBuffer;
++
++ __VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
++ __m128i accumulator = _mm_setzero_si128();
++ __m128i aVal = _mm_setzero_si128();
++
++ for(unsigned int number = 0; number < sse_iters; number++){
++ aVal = _mm_load_si128((__m128i*)aPtr);
++ accumulator = _mm_add_epi8(accumulator, aVal);
++ aPtr += 16;
++ }
++ _mm_store_si128((__m128i*)tempBuffer,accumulator);
++ // Horizontal reduction of the 16 partial sums held in the vector accumulator
++ for(int i = 0; i<16; ++i){
++ returnValue += tempBuffer[i];
++ }
++ // Tail: the num_points % 16 values left after the vectorised loop (aPtr already points at them)
++ for(unsigned int i = 0; i<(num_points % 16); ++i){
++ returnValue += (*aPtr++);
++ }
++
++ *result = returnValue;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Accumulates (sums) the values in the input buffer with a plain scalar loop; the running sum is held in a char, so it is reduced to 8 bits
++ \param result The accumulated result (a single char is written; 0 when num_points is 0)
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){
++ const char* aPtr = inputBuffer;
++ char returnValue = 0;
++
++ for(unsigned int number = 0;number < num_points; number++){
++ returnValue += (*aPtr++);
++ }
++ *result = returnValue;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief Accumulates the values in the input buffer by delegating to the external ORC routine, which writes a 16-bit (short) result
++ \param result The accumulated result: one byte taken from the 16-bit ORC output (the byte at offset 1 of the short)
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
++static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
++ // NOTE(review): resc points at the second byte of res; which half of the short that is depends on host endianness - confirm this is intended
++ short res = 0;
++ char* resc = (char*)&res;
++ resc++;
++
++ volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points);
++
++ *result = *resc;
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */
++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,493 @@
++/*!
++ * \file volk_gnsssdr_8i_index_max_16u.h
++ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_AVX
++#include "immintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, signed char comparison). Unaligned AVX loads, 32 values per iteration
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed (no alignment required)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 32;
++
++ char* basePtr = (char*)src0;
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned int index = 0;
++ __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
++ __m256i ones, compareResults, currentValues;
++ __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
++
++ ones = _mm256_set1_epi8(0xFF);
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
++ // The 8-bit compare is done on the two 128-bit halves with SSE instructions
++ lo = _mm256_castsi256_si128(currentValues);
++ hi = _mm256_extractf128_si256(currentValues,1);
++
++ compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
++ compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
++
++ //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
++ compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
++ // Only fall back to the scalar scan when some value in this block is not below the current max
++ if (!_mm256_testc_si256(compareResults, ones))
++ {
++ _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
++
++ for(int i = 0; i < 32; i++)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = currentValuesBuffer[i];
++ }
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++
++ inputPtr += 32;
++ }
++ // Tail: inputPtr now points at the num_points % 32 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 32); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = inputPtr[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_AVX*/
++
++#ifdef LV_HAVE_SSE4_1
++#include<smmintrin.h>
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, signed char comparison). Unaligned SSE loads, 16 values per iteration
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed (no alignment required)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* basePtr = (char*)src0;
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned int index = 0;
++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
++
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ // Only fall back to the scalar scan when some value in this block is not below the current max
++ if (!_mm_test_all_ones(compareResults))
++ {
++ _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
++
++ for(int i = 0; i < 16; i++)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = currentValuesBuffer[i];
++ }
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++
++ inputPtr += 16;
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = inputPtr[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include<emmintrin.h>
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, signed char comparison). Unaligned SSE2 loads, 16 values per iteration
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed (no alignment required)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* basePtr = (char*)src0;
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned int index = 0;
++ unsigned short mask;
++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ mask = _mm_movemask_epi8(compareResults);
++ // mask has one bit per lane, set when max > value; anything other than 0xFFFF means a candidate exists
++ if (mask != 0xFFFF)
++ {
++ _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
++ mask = ~mask;
++ int i = 0;
++ while (mask > 0)
++ {
++ if ((mask & 1) == 1)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = currentValuesBuffer[i];
++ }
++ }
++ i++;
++ mask >>= 1;
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++ inputPtr += 16;
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = inputPtr[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, since only strictly greater values replace the current max)
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
++
++ if(num_points > 0)
++ {
++ char max = src0[0];
++ unsigned int index = 0;
++
++ for(unsigned int i = 1; i < num_points; ++i)
++ {
++ if(src0[i] > max)
++ {
++ index = i;
++ max = src0[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_AVX
++#include "immintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, signed char comparison). Aligned AVX loads, 32 values per iteration
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed (read with aligned 32-byte loads)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 32;
++
++ char* basePtr = (char*)src0;
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned int index = 0;
++ __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
++ __m256i ones, compareResults, currentValues;
++ __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
++
++ ones = _mm256_set1_epi8(0xFF);
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm256_load_si256((__m256i*)inputPtr);
++ // The 8-bit compare is done on the two 128-bit halves with SSE instructions
++ lo = _mm256_castsi256_si128(currentValues);
++ hi = _mm256_extractf128_si256(currentValues,1);
++
++ compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
++ compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
++
++ //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
++ compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
++ // Only fall back to the scalar scan when some value in this block is not below the current max
++ if (!_mm256_testc_si256(compareResults, ones))
++ {
++ _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
++
++ for(int i = 0; i < 32; i++)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = currentValuesBuffer[i];
++ }
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++
++ inputPtr += 32;
++ }
++ // Tail: inputPtr now points at the num_points % 32 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 32); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = inputPtr[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_AVX*/
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "emmintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, signed char comparison). Aligned SSE loads, 16 values per iteration
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed (read with aligned 16-byte loads)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* basePtr = (char*)src0;
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned int index = 0;
++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_load_si128((__m128i*)inputPtr);
++
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ // Only fall back to the scalar scan when some value in this block is not below the current max
++ if (!_mm_test_all_ones(compareResults))
++ {
++ _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
++
++ for(int i = 0; i < 16; i++)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = currentValuesBuffer[i];
++ }
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++
++ inputPtr += 16;
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = inputPtr[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, signed char comparison). Aligned SSE2 loads, 16 values per iteration
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed (read with aligned 16-byte loads)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* basePtr = (char*)src0;
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned int index = 0;
++ unsigned short mask;
++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_load_si128((__m128i*)inputPtr);
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ mask = _mm_movemask_epi8(compareResults);
++ // mask has one bit per lane, set when max > value; anything other than 0xFFFF means a candidate exists
++ if (mask != 0xFFFF)
++ {
++ _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
++ mask = ~mask;
++ int i = 0;
++ while (mask > 0)
++ {
++ if ((mask & 1) == 1)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = currentValuesBuffer[i];
++ }
++ }
++ i++;
++ mask >>= 1;
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++ inputPtr += 16;
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ index = inputPtr - basePtr + i;
++ max = inputPtr[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Returns the index of the max value in src0 (first occurrence, since only strictly greater values replace the current max)
++ \param target The index of the max value in src0 (left untouched when num_points is 0)
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
++
++ if(num_points > 0)
++ {
++ char max = src0[0];
++ unsigned int index = 0;
++
++ for(unsigned int i = 1; i < num_points; ++i)
++ {
++ if(src0[i] > max)
++ {
++ index = i;
++ max = src0[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,327 @@
++/*!
++ * \file volk_gnsssdr_8i_max_s8i.h
++ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
++#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include<smmintrin.h>
++/*!
++ \brief Computes the max value in src0 (signed char comparison). Unaligned SSE loads, 16 values per iteration
++ \param target The max value in src0. NOTE(review): passed by value, so the assignment below is not visible to the caller
++ \param src0 The buffer of data to be analysed (no alignment required)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
++ inputPtr += 16;
++ }
++
++ _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
++ // Horizontal reduction of the 16 per-lane maxima
++ for(int i = 0; i<16; ++i)
++ {
++ if(maxValuesBuffer[i] > max)
++ {
++ max = maxValuesBuffer[i];
++ }
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ max = inputPtr[i];
++ }
++ }
++ target = max;
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include<emmintrin.h>
++/*!
++ \brief Computes the max value in src0 (signed char comparison). Unaligned SSE2 loads, 16 values per iteration
++ \param target The max value in src0. NOTE(review): passed by value, so the assignment below is not visible to the caller
++ \param src0 The buffer of data to be analysed (no alignment required)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned short mask;
++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ mask = _mm_movemask_epi8(compareResults);
++ // mask has one bit per lane, set when max > value; anything other than 0xFFFF means a candidate exists
++ if (mask != 0xFFFF)
++ {
++ _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
++ mask = ~mask;
++ int i = 0;
++ while (mask > 0)
++ {
++ if ((mask & 1) == 1)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ max = currentValuesBuffer[i];
++ }
++ }
++ i++;
++ mask >>= 1;
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++ inputPtr += 16;
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ max = inputPtr[i];
++ }
++ }
++ target = max;
++ }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Computes the max value in src0 with a plain scalar loop
++ \param target The max value in src0. NOTE(review): passed by value, so the assignment below is not visible to the caller
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
++ if(num_points > 0)
++ {
++ char max = src0[0];
++
++ for(unsigned int i = 1; i < num_points; ++i)
++ {
++ if(src0[i] > max)
++ {
++ max = src0[i];
++ }
++ }
++ target = max;
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
++#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Computes the max value in src0 (signed char comparison). Aligned SSE loads, 16 values per iteration
++ \param target The max value in src0. NOTE(review): passed by value, so the assignment below is not visible to the caller
++ \param src0 The buffer of data to be analysed (read with aligned 16-byte loads)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_load_si128((__m128i*)inputPtr);
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
++ inputPtr += 16;
++ }
++
++ _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
++ // Horizontal reduction of the 16 per-lane maxima
++ for(int i = 0; i<16; ++i)
++ {
++ if(maxValuesBuffer[i] > max)
++ {
++ max = maxValuesBuffer[i];
++ }
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ max = inputPtr[i];
++ }
++ }
++ target = max;
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Computes the max value in src0 (signed char comparison). Aligned SSE2 loads, 16 values per iteration
++ \param target The max value in src0. NOTE(review): passed by value, so the assignment below is not visible to the caller
++ \param src0 The buffer of data to be analysed (read with aligned 16-byte loads)
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
++ if(num_points > 0){
++ const unsigned int sse_iters = num_points / 16;
++
++ char* inputPtr = (char*)src0;
++ char max = src0[0];
++ unsigned short mask;
++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++ __m128i maxValues, compareResults, currentValues;
++
++ maxValues = _mm_set1_epi8(max);
++
++ for(unsigned int number = 0; number < sse_iters; number++)
++ {
++ currentValues = _mm_load_si128((__m128i*)inputPtr);
++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++ mask = _mm_movemask_epi8(compareResults);
++ // mask has one bit per lane, set when max > value; anything other than 0xFFFF means a candidate exists
++ if (mask != 0xFFFF)
++ {
++ _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
++ mask = ~mask;
++ int i = 0;
++ while (mask > 0)
++ {
++ if ((mask & 1) == 1)
++ {
++ if(currentValuesBuffer[i] > max)
++ {
++ max = currentValuesBuffer[i];
++ }
++ }
++ i++;
++ mask >>= 1;
++ }
++ maxValues = _mm_set1_epi8(max);
++ }
++ inputPtr += 16;
++ }
++ // Tail: inputPtr now points at the num_points % 16 values not covered above; scan those, not the start of src0
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ if(inputPtr[i] > max)
++ {
++ max = inputPtr[i];
++ }
++ }
++ target = max;
++ }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Computes the max value in src0 with a plain scalar loop (the nested num_points check below is redundant but harmless)
++ \param target The max value in src0. NOTE(review): passed by value, so the assignment below is not visible to the caller
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
++ if(num_points > 0)
++ {
++ if(num_points > 0)
++ {
++ char max = src0[0];
++
++ for(unsigned int i = 1; i < num_points; ++i)
++ {
++ if(src0[i] > max)
++ {
++ max = src0[i];
++ }
++ }
++ target = max;
++ }
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,184 @@
++/*!
++ * \file volk_gnsssdr_8i_x2_add_8i.h
++ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that adds pairs of 8 bits (char) scalars
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Adds the two input vectors element-wise (8-bit addition) and stores the results in the third vector. Unaligned SSE2 loads/stores, 16 values per iteration
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++
++ const unsigned int sse_iters = num_points / 16;
++
++ char* cPtr = cVector;
++ const char* aPtr = aVector;
++ const char* bPtr= bVector;
++
++ __m128i aVal, bVal, cVal;
++
++ for(unsigned int number = 0; number < sse_iters; number++){
++ // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would require SSE3
++ aVal = _mm_loadu_si128((__m128i*)aPtr);
++ bVal = _mm_loadu_si128((__m128i*)bPtr);
++
++ cVal = _mm_add_epi8(aVal, bVal);
++
++ _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
++
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
++ // Tail: the num_points % 16 values left after the vectorised loop
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds the two input vectors element-wise with a plain scalar loop and stores the results in the third vector
++ \param cVector The vector where the results will be stored (each sum is stored back into a char)
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++ char* cPtr = cVector;
++ const char* aPtr = aVector;
++ const char* bPtr= bVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Adds the two input vectors element-wise (8-bit addition) and stores the results in the third vector. Aligned SSE2 loads/stores, 16 values per iteration
++ \param cVector The vector where the results will be stored (written with aligned 16-byte stores)
++ \param aVector One of the vectors to be added (read with aligned 16-byte loads)
++ \param bVector One of the vectors to be added (read with aligned 16-byte loads)
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++
++ const unsigned int sse_iters = num_points / 16;
++
++ char* cPtr = cVector;
++ const char* aPtr = aVector;
++ const char* bPtr= bVector;
++
++ __m128i aVal, bVal, cVal;
++
++ for(unsigned int number = 0; number < sse_iters; number++){
++
++ aVal = _mm_load_si128((__m128i*)aPtr);
++ bVal = _mm_load_si128((__m128i*)bPtr);
++
++ cVal = _mm_add_epi8(aVal, bVal);
++
++ _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
++
++ aPtr += 16;
++ bPtr += 16;
++ cPtr += 16;
++ }
++ // Tail: the num_points % 16 values left after the vectorised loop
++ for(unsigned int i = 0; i<(num_points % 16); ++i)
++ {
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds the two input vectors and store their results in the third vector
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++ char* cPtr = cVector;
++ const char* aPtr = aVector;
++ const char* bPtr= bVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) + (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief ORC entry point: forwards the addition of two 8-bit vectors to volk_gnsssdr_8i_x2_add_8i_a_orc_impl
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points); // declaration only; no definition is visible here
++static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++ volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points); // arguments are passed through unchanged
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,326 @@
++/*!
++ * \file volk_gnsssdr_8ic_conjugate_8ic.h
++ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that calculates the conjugate of a
++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++
++#ifdef LV_HAVE_AVX
++#include "immintrin.h"
++/*!
++ \brief AVX implementation for unaligned buffers: takes the complex conjugate of a vector of lv_8sc_t samples, 16 samples per SIMD iteration
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ const unsigned int sse_iters = num_points / 16;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++
++ __m256 tmp;
++ __m128i tmp128lo, tmp128hi;
++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); // XOR mask: inverts every odd-offset byte
++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); // +1 on every odd-offset byte completes the two's-complement negation
++
++ for (int i = 0; i < sse_iters; ++i)
++ {
++ tmp = _mm256_loadu_ps((float*)a); // 32 bytes; assumes sizeof(lv_8sc_t) == 2 with the imaginary part in the second byte - TODO confirm against volk_gnsssdr_complex.h
++ tmp = _mm256_xor_ps(tmp, conjugator1);
++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); // the 8-bit add is done on each 128-bit half separately
++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
++ tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); // explicit __m256i -> __m256 cast: tmp is a float vector
++ _mm256_storeu_ps((float*)c, tmp);
++
++ a += 16;
++ c += 16;
++ }
++
++ for (int i = 0; i<(num_points % 16); ++i) // scalar pass over the samples left after the 16-wide loop
++ {
++ *c++ = lv_conj(*a++);
++ }
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSSE3
++#include "tmmintrin.h"
++/*!
++ \brief SSSE3 implementation for unaligned buffers: takes the complex conjugate of a vector of lv_8sc_t samples, 8 samples per SIMD iteration
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ const unsigned int sse_iters = num_points / 8;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ __m128i tmp;
++
++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); // sign pattern: keep even-offset bytes, negate odd-offset bytes
++
++ for (int i = 0; i < sse_iters; ++i)
++ {
++ tmp = _mm_lddqu_si128((__m128i*)a); // 16 bytes; assumes sizeof(lv_8sc_t) == 2 with the imaginary part in the second byte - TODO confirm
++ tmp = _mm_sign_epi8(tmp, conjugator);
++ _mm_storeu_si128((__m128i*)c, tmp);
++ a += 8;
++ c += 8;
++ }
++
++ for (int i = 0; i<(num_points % 8); ++i) // scalar pass over the samples left after the 8-wide loop
++ {
++ *c++ = lv_conj(*a++);
++ }
++
++}
++#endif /* LV_HAVE_SSSE3 */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief SSE3 implementation for unaligned buffers: takes the complex conjugate of a vector of lv_8sc_t samples, 8 samples per SIMD iteration
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ const unsigned int sse_iters = num_points / 8;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ __m128i tmp;
++
++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // XOR mask: inverts every odd-offset byte
++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); // +1 on every odd-offset byte completes the two's-complement negation
++
++ for (int i = 0; i < sse_iters; ++i)
++ {
++ tmp = _mm_lddqu_si128((__m128i*)a); // 16 bytes; assumes sizeof(lv_8sc_t) == 2 with the imaginary part in the second byte - TODO confirm
++ tmp = _mm_xor_si128(tmp, conjugator1);
++ tmp = _mm_add_epi8(tmp, conjugator2);
++ _mm_storeu_si128((__m128i*)c, tmp);
++ a += 8;
++ c += 8;
++ }
++
++ for (int i = 0; i<(num_points % 8); ++i) // scalar pass over the samples left after the 8-wide loop
++ {
++ *c++ = lv_conj(*a++);
++ }
++
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable implementation: takes the complex conjugate of a vector of lv_8sc_t samples by applying lv_conj to each one
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = lv_conj(*aPtr++); // lv_conj is not defined in this file (presumably volk_gnsssdr_complex.h - confirm)
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++
++#ifdef LV_HAVE_AVX
++#include "immintrin.h"
++/*!
++ \brief AVX implementation for aligned buffers: takes the complex conjugate of a vector of lv_8sc_t samples, 16 samples per SIMD iteration
++ \param cVector The vector where the results will be stored (written with _mm256_store_ps, which requires a 32-byte aligned address)
++ \param aVector Vector to be conjugated (read with _mm256_load_ps, which requires a 32-byte aligned address)
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ const unsigned int sse_iters = num_points / 16;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++
++ __m256 tmp;
++ __m128i tmp128lo, tmp128hi;
++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); // XOR mask: inverts every odd-offset byte
++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); // +1 on every odd-offset byte completes the two's-complement negation
++
++ for (int i = 0; i < sse_iters; ++i)
++ {
++ tmp = _mm256_load_ps((float*)a); // 32 bytes; assumes sizeof(lv_8sc_t) == 2 with the imaginary part in the second byte - TODO confirm against volk_gnsssdr_complex.h
++ tmp = _mm256_xor_ps(tmp, conjugator1);
++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); // the 8-bit add is done on each 128-bit half separately
++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
++ tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); // explicit __m256i -> __m256 cast: tmp is a float vector
++ _mm256_store_ps((float*)c, tmp);
++
++ a += 16;
++ c += 16;
++ }
++
++ for (int i = 0; i<(num_points % 16); ++i) // scalar pass over the samples left after the 16-wide loop
++ {
++ *c++ = lv_conj(*a++);
++ }
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSSE3
++#include "tmmintrin.h"
++/*!
++ \brief SSSE3 implementation for aligned buffers: takes the complex conjugate of a vector of lv_8sc_t samples, 8 samples per SIMD iteration
++ \param cVector The vector where the results will be stored (written with _mm_store_si128, which requires a 16-byte aligned address)
++ \param aVector Vector to be conjugated (read with _mm_load_si128, which requires a 16-byte aligned address)
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ const unsigned int sse_iters = num_points / 8;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ __m128i tmp;
++
++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); // sign pattern: keep even-offset bytes, negate odd-offset bytes
++
++ for (int i = 0; i < sse_iters; ++i)
++ {
++ tmp = _mm_load_si128((__m128i*)a); // 16 bytes; assumes sizeof(lv_8sc_t) == 2 with the imaginary part in the second byte - TODO confirm
++ tmp = _mm_sign_epi8(tmp, conjugator);
++ _mm_store_si128((__m128i*)c, tmp);
++ a += 8;
++ c += 8;
++ }
++
++ for (int i = 0; i<(num_points % 8); ++i) // scalar pass over the samples left after the 8-wide loop
++ {
++ *c++ = lv_conj(*a++);
++ }
++
++}
++#endif /* LV_HAVE_SSSE3 */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief SSE3 implementation for aligned buffers: takes the complex conjugate of a vector of lv_8sc_t samples, 8 samples per SIMD iteration
++ \param cVector The vector where the results will be stored (written with _mm_store_si128, which requires a 16-byte aligned address)
++ \param aVector Vector to be conjugated (read with _mm_load_si128, which requires a 16-byte aligned address)
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ const unsigned int sse_iters = num_points / 8;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ __m128i tmp;
++
++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // XOR mask: inverts every odd-offset byte
++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); // +1 on every odd-offset byte completes the two's-complement negation
++
++ for (int i = 0; i < sse_iters; ++i)
++ {
++ tmp = _mm_load_si128((__m128i*)a); // 16 bytes; assumes sizeof(lv_8sc_t) == 2 with the imaginary part in the second byte - TODO confirm
++ tmp = _mm_xor_si128(tmp, conjugator1);
++ tmp = _mm_add_epi8(tmp, conjugator2);
++ _mm_store_si128((__m128i*)c, tmp);
++ a += 8;
++ c += 8;
++ }
++
++ for (int i = 0; i<(num_points % 8); ++i) // scalar pass over the samples left after the 8-wide loop
++ {
++ *c++ = lv_conj(*a++);
++ }
++
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable implementation (aligned variant): takes the complex conjugate of a vector of lv_8sc_t samples by applying lv_conj to each one
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = lv_conj(*aPtr++); // lv_conj is not defined in this file (presumably volk_gnsssdr_complex.h - confirm)
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief ORC entry point: forwards the conjugation of a vector of lv_8sc_t samples to volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of complex samples in aVector to be conjugated and stored into cVector
++ */
++extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points); // declaration only; no definition is visible here
++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++ volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points); // arguments are passed through unchanged
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,320 @@
++/*!
++ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
++ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that calculates the magnitude squared of a
++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
++ * result = (real*real) + (imag*imag)
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++#include "tmmintrin.h"
++/*!
++ \brief Unaligned SIMD implementation: stores the low 8 bits of (real*real + imag*imag) for each complex sample, 16 samples per iteration. NOTE(review): _mm_hadd_epi16 and _mm_shuffle_epi8 come from tmmintrin.h (SSSE3) although the guard is LV_HAVE_SSE3 - confirm the intended guard
++ \param complexVector The vector containing the complex input values, read as interleaved bytes (first byte of each pair is used as the real part, second as the imaginary part)
++ \param magnitudeVector The vector containing the real output values, one char per complex sample
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
++
++ const unsigned int sse_iters = num_points / 16;
++
++ const char* complexVectorPtr = (char*)complexVector;
++ char* magnitudeVectorPtr = magnitudeVector;
++
++ __m128i zero, result8;
++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;
++
++ zero = _mm_setzero_si128();
++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // low byte of each 16-bit lane -> result bytes 0..7, upper 8 bytes zeroed
++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); // low byte of each 16-bit lane -> result bytes 8..15, lower 8 bytes zeroed
++
++ for(int number = 0;number < sse_iters; number++)
++ {
++ avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); // first 16 input bytes of this iteration
++ avectorlo = _mm_unpacklo_epi8 (avector, zero); // interleave with zero: each byte becomes a 16-bit lane
++ avectorhi = _mm_unpackhi_epi8 (avector, zero);
++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); // square every component
++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); // add adjacent lanes: real^2 + imag^2
++
++ complexVectorPtr += 16;
++
++ bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); // next 16 input bytes, same steps
++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
++
++ complexVectorPtr += 16;
++
++ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); // pack the 16 low bytes into one register
++
++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8);
++
++ magnitudeVectorPtr += 16;
++
++
++ }
++
++ for (int i = 0; i<(num_points % 16); ++i) // scalar pass over the samples left after the 16-wide loop
++ {
++ const char valReal = *complexVectorPtr++;
++ const char valImag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++//#ifdef LV_HAVE_SSE
++//#include <xmmintrin.h>
++///*!
++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
++// \param complexVector The vector containing the complex input values
++// \param magnitudeVector The vector containing the real output values
++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
++// */
++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++// unsigned int number = 0;
++// const unsigned int quarterPoints = num_points / 4;
++//
++// const float* complexVectorPtr = (float*)complexVector;
++// float* magnitudeVectorPtr = magnitudeVector;
++//
++// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
++// for(;number < quarterPoints; number++){
++// cplxValue1 = _mm_loadu_ps(complexVectorPtr);
++// complexVectorPtr += 4;
++//
++// cplxValue2 = _mm_loadu_ps(complexVectorPtr);
++// complexVectorPtr += 4;
++//
++// // Arrange in i1i2i3i4 format
++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++// // Arrange in q1q2q3q4 format
++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++//
++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
++//
++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
++//
++// _mm_storeu_ps(magnitudeVectorPtr, result);
++// magnitudeVectorPtr += 4;
++// }
++//
++// number = quarterPoints * 4;
++// for(; number < num_points; number++){
++// float val1Real = *complexVectorPtr++;
++// float val1Imag = *complexVectorPtr++;
++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++// }
++//}
++//#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable implementation: computes (real*real) + (imag*imag) for each complex sample of complexVector and stores the result, converted to char, in magnitudeVector
++ \param complexVector The vector containing the complex input values, read as interleaved bytes (first byte of each pair is used as the real part, second as the imaginary part)
++ \param magnitudeVector The vector containing the real output values, one char per complex sample
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
++ const char* complexVectorPtr = (char*)complexVector;
++ char* magnitudeVectorPtr = magnitudeVector;
++
++ for(int number = 0; number < num_points; number++){
++ const char real = *complexVectorPtr++;
++ const char imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // computed in int, then converted to char by the store
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <math.h>
++
++#ifdef LV_HAVE_SSE3
++#include <tmmintrin.h>
++/*!
++ \brief Aligned SIMD implementation: stores the low 8 bits of (real*real + imag*imag) for each complex sample, 16 samples per iteration. NOTE(review): _mm_hadd_epi16 and _mm_shuffle_epi8 come from tmmintrin.h (SSSE3) although the guard is LV_HAVE_SSE3 - confirm the intended guard
++ \param complexVector The vector containing the complex input values, read as interleaved bytes with _mm_load_si128 (requires a 16-byte aligned address)
++ \param magnitudeVector The vector containing the real output values, one char per complex sample, written with _mm_store_si128 (requires a 16-byte aligned address)
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
++
++ const unsigned int sse_iters = num_points / 16;
++
++ const char* complexVectorPtr = (char*)complexVector;
++ char* magnitudeVectorPtr = magnitudeVector;
++
++ __m128i zero, result8;
++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;
++
++ zero = _mm_setzero_si128();
++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); // low byte of each 16-bit lane -> result bytes 0..7, upper 8 bytes zeroed
++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); // low byte of each 16-bit lane -> result bytes 8..15, lower 8 bytes zeroed
++
++ for(int number = 0;number < sse_iters; number++)
++ {
++ avector = _mm_load_si128((__m128i*)complexVectorPtr); // first 16 input bytes of this iteration
++ avectorlo = _mm_unpacklo_epi8 (avector, zero); // interleave with zero: each byte becomes a 16-bit lane
++ avectorhi = _mm_unpackhi_epi8 (avector, zero);
++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); // square every component
++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); // add adjacent lanes: real^2 + imag^2
++
++ complexVectorPtr += 16;
++
++ bvector = _mm_load_si128((__m128i*)complexVectorPtr); // next 16 input bytes, same steps
++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
++
++ complexVectorPtr += 16;
++
++ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); // pack the 16 low bytes into one register
++
++ _mm_store_si128((__m128i*)magnitudeVectorPtr, result8);
++
++ magnitudeVectorPtr += 16;
++
++
++ }
++
++ for (int i = 0; i<(num_points % 16); ++i) // scalar pass over the samples left after the 16-wide loop
++ {
++ const char valReal = *complexVectorPtr++;
++ const char valImag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
++ }
++}
++#endif /* LV_HAVE_SSE3 */
++
++//#ifdef LV_HAVE_SSE
++//#include <xmmintrin.h>
++///*!
++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
++// \param complexVector The vector containing the complex input values
++// \param magnitudeVector The vector containing the real output values
++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
++// */
++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++// unsigned int number = 0;
++// const unsigned int quarterPoints = num_points / 4;
++//
++// const float* complexVectorPtr = (float*)complexVector;
++// float* magnitudeVectorPtr = magnitudeVector;
++//
++// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
++// for(;number < quarterPoints; number++){
++// cplxValue1 = _mm_load_ps(complexVectorPtr);
++// complexVectorPtr += 4;
++//
++// cplxValue2 = _mm_load_ps(complexVectorPtr);
++// complexVectorPtr += 4;
++//
++// // Arrange in i1i2i3i4 format
++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
++// // Arrange in q1q2q3q4 format
++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
++//
++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
++//
++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
++//
++// _mm_store_ps(magnitudeVectorPtr, result);
++// magnitudeVectorPtr += 4;
++// }
++//
++// number = quarterPoints * 4;
++// for(; number < num_points; number++){
++// float val1Real = *complexVectorPtr++;
++// float val1Imag = *complexVectorPtr++;
++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
++// }
++//}
++//#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable implementation (aligned variant): computes (real*real) + (imag*imag) for each complex sample of complexVector and stores the result, converted to char, in magnitudeVector
++ \param complexVector The vector containing the complex input values, read as interleaved bytes (first byte of each pair is used as the real part, second as the imaginary part)
++ \param magnitudeVector The vector containing the real output values, one char per complex sample
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
++ const char* complexVectorPtr = (char*)complexVector;
++ char* magnitudeVectorPtr = magnitudeVector;
++
++ for(int number = 0; number < num_points; number++){
++ const char real = *complexVectorPtr++;
++ const char imag = *complexVectorPtr++;
++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // computed in int, then converted to char by the store
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief ORC entry point: forwards the magnitude-squared computation to volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl
++ \param complexVector The vector containing the complex input values
++ \param magnitudeVector The vector containing the real output values
++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++ */
++extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points); // declaration only; no definition is visible here
++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
++ volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points); // arguments are passed through unchanged
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,271 @@
++/*!
++ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
++ * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that multiplies a group of 16 bits vectors
++ * (8 bits the real part and 8 bits the imaginary part) by one constant vector
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief SSE3 implementation for unaligned buffers: multiplies every complex sample of aVector by one complex scalar, 8 samples per SIMD iteration, keeping the low 8 bits of each real and imaginary product
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
++
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
++
++ y = _mm_set1_epi16 (*(short*)&scalar); // broadcast the scalar's 16 bits to all lanes; assumes sizeof(lv_8sc_t) == 2 - TODO confirm
++ imagy = _mm_srli_si128 (y, 1); // shift the whole register right by one byte so the second byte of each sample lands in the low byte
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ for(int number = 0;number < sse_iters; number++){
++
++ x = _mm_lddqu_si128((__m128i*)a);
++
++ imagx = _mm_srli_si128 (x, 1); // same byte split as for the scalar
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
++ realc = _mm_and_si128 (realc, mult1);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
++ imagc = _mm_and_si128 (imagc, mult1);
++ imagc = _mm_slli_si128 (imagc, 1); // move the imaginary byte back to the odd offset
++
++ totalc = _mm_or_si128 (realc, imagc);
++
++ _mm_storeu_si128((__m128i*)c, totalc);
++
++ a += 8;
++ c += 8;
++ }
++
++ for (int i = 0; i<(num_points % 8); ++i) // scalar pass over the samples left after the 8-wide loop
++ {
++ *c++ = (*a++) * scalar;
++ }
++
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Portable implementation: multiplies every complex sample of aVector by one complex scalar and stores the products in cVector
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
++
++ /*lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++
++ for (int i = 0; i<num_points; ++i)
++ {
++ *cPtr++ = (*aPtr++) * scalar;
++ }*/
++
++ lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++ unsigned int number = num_points;
++
++ // manually unrolled loop: eight products per pass
++ while (number >= 8){
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ number -= 8;
++ }
++
++ // clean up the fewer-than-eight samples that remain
++ while (number-- > 0)
++ *cPtr++ = *aPtr++ * scalar;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Multiplies every element of the input vector by a complex scalar and stores the products in cVector (aligned SSE3 version)
++ \param cVector The vector where the results will be stored (written with _mm_store_si128, which requires 16-byte alignment)
++ \param aVector The vector to be multiplied (read with _mm_load_si128, which requires 16-byte alignment)
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
++ // Each vector iteration below consumes 8 elements (one 16-byte load).
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
++
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++ // Broadcast the scalar's two bytes to every 16-bit lane, then split them into zero-extended lanes. NOTE(review): the (short*) cast type-puns scalar - confirm aliasing rules allow it.
++ y = _mm_set1_epi16 (*(short*)&scalar);
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ for(int number = 0;number < sse_iters; number++){
++ // Aligned load of the next 8 input elements.
++ x = _mm_load_si128((__m128i*)a);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir; keep the low 8 bits of each lane and shift imag into the odd bytes.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ realc = _mm_and_si128 (realc, mult1);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ imagc = _mm_and_si128 (imagc, mult1);
++ imagc = _mm_slli_si128 (imagc, 1);
++ // Merge real (even bytes) and imag (odd bytes) back into interleaved order.
++ totalc = _mm_or_si128 (realc, imagc);
++
++ _mm_store_si128((__m128i*)c, totalc);
++
++ a += 8;
++ c += 8;
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ *c++ = (*a++) * scalar;
++ }
++
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Multiplies every element of the input vector by a complex scalar and stores the products in cVector
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
++ // Plain one-element-per-iteration version, currently commented out:
++ /*lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++
++ for (int i = 0; i<num_points; ++i)
++ {
++ *cPtr++ = (*aPtr++) * scalar;
++ }*/
++
++ lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++ unsigned int number = num_points;
++
++ // Manually unrolled loop: 8 products per iteration while at least 8 elements remain
++ while (number >= 8){
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ *cPtr++ = (*aPtr++) * scalar;
++ number -= 8;
++ }
++
++ // Tail: the remaining (fewer than 8) elements, one at a time
++ while (number-- > 0)
++ *cPtr++ = *aPtr++ * scalar;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief ORC wrapper: multiplies the input vector by a complex scalar by forwarding to the external ORC implementation, passing the scalar as separate real and imaginary parts
++ \param cVector The vector where the results will be stored
++ \param aVector The vector to be multiplied
++ \param scalar The complex scalar to multiply aVector (split with lv_creal / lv_cimag before the call)
++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++ */
++extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
++ volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,499 @@
++/*!
++ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
++ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
++ * and 8 bits the imaginary part) and accumulates them
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <stdio.h>
++#include <string.h>
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Computes the dot product of two complex 8-bit vectors (sum of input[i] * taps[i]) and writes the single accumulated value to result
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated
++ \param taps The other vector to be multiplied and accumulated
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
++ // Plain accumulate-into-result version, currently commented out:
++ /*lv_8sc_t* cPtr = result;
++ const lv_8sc_t* aPtr = input;
++ const lv_8sc_t* bPtr = taps;
++
++ for(int number = 0; number < num_points; number++){
++ *cPtr += (*aPtr++) * (*bPtr++);
++ }*/
++ // View the complex arrays as raw bytes: even index is used as the real part, odd index as the imaginary part.
++ char * res = (char*) result;
++ char * in = (char*) input;
++ char * tp = (char*) taps;
++ unsigned int n_2_ccomplex_blocks = num_points/2;
++ unsigned int isodd = num_points & 1;
++ // Two independent accumulators, one per element of each processed pair; each update is narrowed back to char.
++ char sum0[2] = {0,0};
++ char sum1[2] = {0,0};
++ unsigned int i = 0;
++ // Two complex products per iteration.
++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++
++ in += 4;
++ tp += 4;
++ }
++ // Combine the two partial sums into the output value.
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
++
++ // Cleanup if we had an odd number of points: add the product of the last pair
++ for(i = 0; i < isodd; ++i) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Computes the dot product of two complex 8-bit vectors (sum of input[i] * taps[i]) and writes the single accumulated value to result (unaligned SSE2 version)
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated (read with unaligned loads)
++ \param taps The other vector to be multiplied and accumulated (read with unaligned loads)
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
++
++ lv_8sc_t dotProduct;
++ memset(&dotProduct, 0x0, 2*sizeof(char));
++
++ const lv_8sc_t* a = input;
++ const lv_8sc_t* b = taps;
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points/8;
++
++ if (sse_iters>0)
++ {
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++ realcacc = _mm_setzero_si128();
++ imagcacc = _mm_setzero_si128();
++
++ for(unsigned int number = 0; number < sse_iters; number++){
++ // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would need SSE3 (pmmintrin.h).
++ x = _mm_loadu_si128((const __m128i*)a);
++ y = _mm_loadu_si128((const __m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ // Accumulate per lane in 16 bits; masking is deferred until after the loop.
++ realcacc = _mm_add_epi16 (realcacc, realc);
++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
++
++ a += 8;
++ b += 8;
++ }
++ // Keep the low 8 bits of each accumulated lane and shift imag into the odd bytes.
++ realcacc = _mm_and_si128 (realcacc, mult1);
++ imagcacc = _mm_and_si128 (imagcacc, mult1);
++ imagcacc = _mm_slli_si128 (imagcacc, 1);
++ // Merge real (even bytes) and imag (odd bytes) into 8 interleaved partial sums.
++ totalc = _mm_or_si128 (realcacc, imagcacc);
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
++
++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
++ // Horizontal reduction of the 8 per-lane partial sums.
++ for (int i = 0; i<8; ++i)
++ {
++ dotProduct += dotProductVector[i];
++ }
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ dotProduct += (*a++) * (*b++);
++ }
++
++ *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Computes the dot product of two complex 8-bit vectors (sum of input[i] * taps[i]) and writes the single accumulated value to result (unaligned SSE4.1 version)
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated (read with unaligned loads)
++ \param taps The other vector to be multiplied and accumulated (read with unaligned loads)
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
++
++ lv_8sc_t dotProduct;
++ memset(&dotProduct, 0x0, 2*sizeof(char));
++
++ const lv_8sc_t* a = input;
++ const lv_8sc_t* b = taps;
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points/8;
++
++ if (sse_iters>0)
++ {
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++ realcacc = _mm_setzero_si128();
++ imagcacc = _mm_setzero_si128();
++
++ for(int number = 0; number < sse_iters; number++){
++ // Unaligned loads of the next 8 elements of each operand.
++ x = _mm_lddqu_si128((__m128i*)a);
++ y = _mm_lddqu_si128((__m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ // Accumulate per lane in 16 bits.
++ realcacc = _mm_add_epi16 (realcacc, realc);
++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
++
++ a += 8;
++ b += 8;
++ }
++ // Move the low byte of each imag lane into the odd byte position.
++ imagcacc = _mm_slli_si128 (imagcacc, 1);
++ // Take the even bytes from realcacc (mask 0xFF) and the odd bytes from the shifted imagcacc.
++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
++
++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
++ // Horizontal reduction of the 8 per-lane partial sums.
++ for (int i = 0; i<8; ++i)
++ {
++ dotProduct += dotProductVector[i];
++ }
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ dotProduct += (*a++) * (*b++);
++ }
++
++ *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/
++
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <stdio.h>
++#include <string.h>
++
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Computes the dot product of two complex 8-bit vectors (sum of input[i] * taps[i]) and writes the single accumulated value to result
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated
++ \param taps The other vector to be multiplied and accumulated
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
++ // Plain accumulate-into-result version, currently commented out:
++ /*lv_8sc_t* cPtr = result;
++ const lv_8sc_t* aPtr = input;
++ const lv_8sc_t* bPtr = taps;
++
++ for(int number = 0; number < num_points; number++){
++ *cPtr += (*aPtr++) * (*bPtr++);
++ }*/
++ // View the complex arrays as raw bytes: even index is used as the real part, odd index as the imaginary part.
++ char * res = (char*) result;
++ char * in = (char*) input;
++ char * tp = (char*) taps;
++ unsigned int n_2_ccomplex_blocks = num_points/2;
++ unsigned int isodd = num_points & 1;
++ // Two independent accumulators, one per element of each processed pair; each update is narrowed back to char.
++ char sum0[2] = {0,0};
++ char sum1[2] = {0,0};
++ unsigned int i = 0;
++ // Two complex products per iteration.
++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++
++ in += 4;
++ tp += 4;
++ }
++ // Combine the two partial sums into the output value.
++ res[0] = sum0[0] + sum1[0];
++ res[1] = sum0[1] + sum1[1];
++
++ // Cleanup if we had an odd number of points: add the product of the last pair
++ for(i = 0; i < isodd; ++i) {
++ *result += input[num_points - 1] * taps[num_points - 1];
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Computes the dot product of two complex 8-bit vectors (sum of input[i] * taps[i]) and writes the single accumulated value to result (aligned SSE2 version)
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment)
++ \param taps The other vector to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment)
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
++
++ lv_8sc_t dotProduct;
++ memset(&dotProduct, 0x0, 2*sizeof(char));
++
++ const lv_8sc_t* a = input;
++ const lv_8sc_t* b = taps;
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points/8;
++
++ if (sse_iters>0)
++ {
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++ realcacc = _mm_setzero_si128();
++ imagcacc = _mm_setzero_si128();
++
++ for(int number = 0; number < sse_iters; number++){
++ // Aligned loads of the next 8 elements of each operand.
++ x = _mm_load_si128((__m128i*)a);
++ y = _mm_load_si128((__m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ // Accumulate per lane in 16 bits; masking is deferred until after the loop.
++ realcacc = _mm_add_epi16 (realcacc, realc);
++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
++
++ a += 8;
++ b += 8;
++ }
++ // Keep the low 8 bits of each accumulated lane and shift imag into the odd bytes.
++ realcacc = _mm_and_si128 (realcacc, mult1);
++ imagcacc = _mm_and_si128 (imagcacc, mult1);
++ imagcacc = _mm_slli_si128 (imagcacc, 1);
++ // Merge real (even bytes) and imag (odd bytes) into 8 interleaved partial sums.
++ totalc = _mm_or_si128 (realcacc, imagcacc);
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
++
++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
++ // Horizontal reduction of the 8 per-lane partial sums.
++ for (int i = 0; i<8; ++i)
++ {
++ dotProduct += dotProductVector[i];
++ }
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ dotProduct += (*a++) * (*b++);
++ }
++
++ *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Computes the dot product of two complex 8-bit vectors (sum of input[i] * taps[i]) and writes the single accumulated value to result (aligned SSE4.1 version)
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment)
++ \param taps The other vector to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment)
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
++
++ lv_8sc_t dotProduct;
++ memset(&dotProduct, 0x0, 2*sizeof(char));
++
++ const lv_8sc_t* a = input;
++ const lv_8sc_t* b = taps;
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points/8;
++
++ if (sse_iters>0)
++ {
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++ realcacc = _mm_setzero_si128();
++ imagcacc = _mm_setzero_si128();
++
++ for(int number = 0; number < sse_iters; number++){
++ // Aligned loads of the next 8 elements of each operand.
++ x = _mm_load_si128((__m128i*)a);
++ y = _mm_load_si128((__m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ // Accumulate per lane in 16 bits.
++ realcacc = _mm_add_epi16 (realcacc, realc);
++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
++
++ a += 8;
++ b += 8;
++ }
++ // Move the low byte of each imag lane into the odd byte position.
++ imagcacc = _mm_slli_si128 (imagcacc, 1);
++ // Take the even bytes from realcacc (mask 0xFF) and the odd bytes from the shifted imagcacc.
++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
++
++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
++ // Horizontal reduction of the 8 per-lane partial sums.
++ for (int i = 0; i<8; ++i)
++ {
++ dotProduct += dotProductVector[i];
++ }
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ dotProduct += (*a++) * (*b++);
++ }
++
++ *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief ORC wrapper: computes the dot product of two complex 8-bit vectors by calling the external ORC implementation and packing one byte of each of its two short outputs into result
++ \param result Pointer to the complex value that receives the accumulated dot product
++ \param input One of the vectors to be multiplied and accumulated
++ \param taps The other vector to be multiplied and accumulated
++ \param num_points The number of complex values in input and taps to be multiplied together and accumulated
++ */
++extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){
++ // resRealChar / resImagChar are advanced to the second byte of the corresponding short.
++ short resReal = 0;
++ char* resRealChar = (char*)&resReal;
++ resRealChar++;
++
++ short resImag = 0;
++ char* resImagChar = (char*)&resImag;
++ resImagChar++;
++ // Presumably the ORC implementation writes its accumulated sums into the two shorts - verify against the .orc source.
++ volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points);
++ // NOTE(review): only the second byte of each short is used; which byte that is depends on endianness - confirm it matches the ORC kernel's output.
++ *result = lv_cmake(*resRealChar, *resImagChar);
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,346 @@
++/*!
++ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h
++ * \brief Volk protokernel: multiplies two 16 bits vectors
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
++ * and 8 bits the imaginary part)
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Multiplies the two input complex vectors element by element and stores the products in cVector (unaligned SSE2 version)
++ \param cVector The vector where the results will be stored (written with an unaligned store)
++ \param aVector One of the vectors to be multiplied (read with unaligned loads)
++ \param bVector One of the vectors to be multiplied (read with unaligned loads)
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ for(unsigned int number = 0;number < sse_iters; number++){
++ // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would need SSE3 (pmmintrin.h).
++ x = _mm_loadu_si128((const __m128i*)a);
++ y = _mm_loadu_si128((const __m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir; keep the low 8 bits of each lane and shift imag into the odd bytes.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ realc = _mm_and_si128 (realc, mult1);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ imagc = _mm_and_si128 (imagc, mult1);
++ imagc = _mm_slli_si128 (imagc, 1);
++ // Merge real (even bytes) and imag (odd bytes) back into interleaved order.
++ totalc = _mm_or_si128 (realc, imagc);
++
++ _mm_storeu_si128((__m128i*)c, totalc);
++
++ a += 8;
++ b += 8;
++ c += 8;
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ *c++ = (*a++) * (*b++);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Multiplies the two input complex vectors element by element and stores the products in cVector (unaligned SSE4.1 version)
++ \param cVector The vector where the results will be stored (written with an unaligned store)
++ \param aVector One of the vectors to be multiplied (read with unaligned loads)
++ \param bVector One of the vectors to be multiplied (read with unaligned loads)
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ for(int number = 0;number < sse_iters; number++){
++ // Unaligned loads of the next 8 elements of each operand.
++ x = _mm_lddqu_si128((__m128i*)a);
++ y = _mm_lddqu_si128((__m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir; shift imag so its low byte lands in the odd byte position.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ imagc = _mm_slli_si128 (imagc, 1);
++ // Take the even bytes from realc (mask 0xFF) and the odd bytes from the shifted imagc.
++ totalc = _mm_blendv_epi8 (imagc, realc, mult1);
++
++ _mm_storeu_si128((__m128i*)c, totalc);
++
++ a += 8;
++ b += 8;
++ c += 8;
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ *c++ = (*a++) * (*b++);
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Multiplies the two input complex vectors element by element and stores the products in cVector
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++ lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++ const lv_8sc_t* bPtr = bVector;
++ // The counter is unsigned like num_points, so it cannot overflow as a signed int when num_points > INT_MAX.
++ for(unsigned int number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Multiplies the two input complex vectors element by element and stores the products in cVector (aligned SSE2 version)
++ \param cVector The vector where the results will be stored (written with _mm_store_si128, which requires 16-byte alignment)
++ \param aVector One of the vectors to be multiplied (read with _mm_load_si128, which requires 16-byte alignment)
++ \param bVector One of the vectors to be multiplied (read with _mm_load_si128, which requires 16-byte alignment)
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++ // Each vector iteration below consumes 8 elements (one 16-byte load per operand).
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++ // mult1: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte.
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ for(int number = 0;number < sse_iters; number++){
++ // Aligned loads of the next 8 elements of each operand.
++ x = _mm_load_si128((__m128i*)a);
++ y = _mm_load_si128((__m128i*)b);
++ // Split x into zero-extended 16-bit lanes: low byte -> realx, high byte -> imagx.
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++ // Same split for y.
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++ // The four 16-bit partial products of a complex multiplication.
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++ // re = rr - ii, im = ri + ir; keep the low 8 bits of each lane and shift imag into the odd bytes.
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ realc = _mm_and_si128 (realc, mult1);
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++ imagc = _mm_and_si128 (imagc, mult1);
++ imagc = _mm_slli_si128 (imagc, 1);
++ // Merge real (even bytes) and imag (odd bytes) back into interleaved order.
++ totalc = _mm_or_si128 (realc, imagc);
++
++ _mm_store_si128((__m128i*)c, totalc);
++
++ a += 8;
++ b += 8;
++ c += 8;
++ }
++ // Scalar tail for the remaining num_points % 8 elements.
++ for (int i = 0; i<(num_points % 8); ++i)
++ {
++ *c++ = (*a++) * (*b++);
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Multiplies the two input complex vectors element by element and stores the results in the third vector (SSE4.1 version using aligned 128-bit loads and stores; 8 complex values per iteration, the remaining num_points % 8 values are handled by a scalar loop)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++
++ const unsigned int sse_iters = num_points / 8; // number of full 8-value iterations
++
++ __m128i x, y;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
++ lv_8sc_t* c = cVector;
++ const lv_8sc_t* a = aVector;
++ const lv_8sc_t* b = bVector;
++
++ // Byte mask with 0xFF in the low byte of every 16-bit lane; used both to split the inputs and as the blend selector
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ for(int number = 0;number < sse_iters; number++){
++
++ x = _mm_load_si128((__m128i*)a); // aligned 16-byte loads
++ y = _mm_load_si128((__m128i*)b);
++
++ imagx = _mm_srli_si128 (x, 1); // shift the whole register right by one byte so the odd bytes land in the low byte of each lane
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1); // even bytes, left in place and zero-extended to 16 bits by the mask
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // the four partial products, low 16 bits of each
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // re = re_x*re_y - im_x*im_y
++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // im = re_x*im_y + im_x*re_y
++ imagc = _mm_slli_si128 (imagc, 1); // move the low byte of each imaginary result to the odd byte positions
++
++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); // even bytes from realc, odd bytes from imagc
++
++ _mm_store_si128((__m128i*)c, totalc); // aligned 16-byte store
++
++ a += 8;
++ b += 8;
++ c += 8;
++ }
++
++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
++ {
++ *c++ = (*a++) * (*b++);
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Multiplies the two input complex vectors element by element and stores the results in the third vector (portable C version, one value per iteration)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++ lv_8sc_t* cPtr = cVector;
++ const lv_8sc_t* aPtr = aVector;
++ const lv_8sc_t* bPtr = bVector;
++
++ // The counter is unsigned like num_points: no signed/unsigned comparison and no signed overflow for counts above INT_MAX
++ for(unsigned int number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) * (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief Multiplies the two input complex vectors and stores their results in the third vector by forwarding all arguments unchanged to the externally defined volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl (NOTE(review): the wrapper is tagged _u_ while the implementation is tagged _a_ -- presumably the usual ORC naming, confirm)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be multiplied
++ \param bVector One of the vectors to be multiplied
++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++ */
++extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++ volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,613 @@
++/*!
++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
++ * real part and 8 bits the imaginary part), and accumulates the result
++ * in 32-bit single-precision floating-point values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 version using unaligned loads; 8 complex samples per iteration, the remaining num_points % 8 samples are handled by a scalar loop)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single value, reset to 0 and then accumulated into)
++ \param P_out Prompt correlation output (a single value, reset to 0 and then accumulated into)
++ \param L_out Late correlation output (a single value, reset to 0 and then accumulated into)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample iterations
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 E_code_acc, P_code_acc, L_code_acc;
++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++ __m128 output_ps;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask with 0xFF in the low byte of every 16-bit lane
++
++ E_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off (the CM_* macros come from the CommonMacros headers and are not visible here)
++ x = _mm_lddqu_si128((__m128i*)input_ptr);
++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++
++ //Get prompt values
++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++
++ //Get late values
++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++
++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i) // each accumulator register was stored as two lv_32fc_t values; add both to the output
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole register
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 version using unaligned loads; 8 complex samples per iteration, the remaining num_points % 8 samples are handled by a scalar loop)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single value, reset to 0 and then accumulated into)
++ \param P_out Prompt correlation output (a single value, reset to 0 and then accumulated into)
++ \param L_out Late correlation output (a single value, reset to 0 and then accumulated into)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample iterations
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 E_code_acc, P_code_acc, L_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 output_ps_1, output_ps_2;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask with 0xFF in the low byte of every 16-bit lane
++
++ E_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off. Loads use _mm_loadu_si128, the SSE2 unaligned load; _mm_lddqu_si128 would need SSE3 (pmmintrin.h)
++ x = _mm_loadu_si128((__m128i*)input_ptr);
++ y = _mm_loadu_si128((__m128i*)carrier_ptr);
++
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y = _mm_loadu_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++ //Get prompt values
++ y = _mm_loadu_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++ //Get late values
++ y = _mm_loadu_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++
++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i) // each accumulator register was stored as two lv_32fc_t values; add both to the output
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole register
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable C version, one sample per iteration)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single value, reset to 0 and then accumulated into)
++ \param P_out Prompt correlation output (a single value, reset to 0 and then accumulated into)
++ \param L_out Late correlation output (a single value, reset to 0 and then accumulated into)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ lv_8sc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation (unsigned index like num_points: no signed overflow for counts above INT_MAX)
++ for(unsigned int i=0; i < num_points; ++i)
++ {
++ //Perform the carrier wipe-off; the product is stored back into an 8-bit complex sample
++ bb_signal_sample = input[i] * carrier[i];
++ // Now get early, late, and prompt values for each
++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 version using aligned loads; 8 complex samples per iteration, the remaining num_points % 8 samples are handled by a scalar loop)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single value, reset to 0 and then accumulated into)
++ \param P_out Prompt correlation output (a single value, reset to 0 and then accumulated into)
++ \param L_out Late correlation output (a single value, reset to 0 and then accumulated into)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample iterations
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 E_code_acc, P_code_acc, L_code_acc;
++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++ __m128 output_ps;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask with 0xFF in the low byte of every 16-bit lane
++
++ E_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off (the CM_* macros come from the CommonMacros headers and are not visible here)
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++
++ //Get prompt values
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++
++ //Get late values
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++
++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i) // each accumulator register was stored as two lv_32fc_t values; add both to the output
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole register
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 version using aligned loads; 8 complex samples per iteration, the remaining num_points % 8 samples are handled by a scalar loop)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single value, reset to 0 and then accumulated into)
++ \param P_out Prompt correlation output (a single value, reset to 0 and then accumulated into)
++ \param L_out Late correlation output (a single value, reset to 0 and then accumulated into)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample iterations
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 E_code_acc, P_code_acc, L_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 output_ps_1, output_ps_2;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask with 0xFF in the low byte of every 16-bit lane
++
++ E_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off (the CM_* macros come from the CommonMacros headers and are not visible here)
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get early values
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++ //Get prompt values
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++ //Get late values
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++
++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i) // each accumulator register was stored as two lv_32fc_t values; add both to the output
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole register
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable C version, one sample per iteration)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output (a single value, reset to 0 and then accumulated into)
++ \param P_out Prompt correlation output (a single value, reset to 0 and then accumulated into)
++ \param L_out Late correlation output (a single value, reset to 0 and then accumulated into)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ lv_8sc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation (unsigned index like num_points: no signed overflow for counts above INT_MAX)
++ for(unsigned int i=0; i < num_points; ++i)
++ {
++ //Perform the carrier wipe-off; the product is stored back into an 8-bit complex sample
++ bb_signal_sample = input[i] * carrier[i];
++ // Now get early, late, and prompt values for each
++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,874 @@
++/*!
++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
++ * real part and 8 bits the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16 bits vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16 bits vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16 bits vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++ /*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_8sc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_8sc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_8sc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ real_E_code_acc = _mm_setzero_si128();
++ imag_E_code_acc = _mm_setzero_si128();
++ real_L_code_acc = _mm_setzero_si128();
++ imag_L_code_acc = _mm_setzero_si128();
++ real_P_code_acc = _mm_setzero_si128();
++ imag_P_code_acc = _mm_setzero_si128();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x = _mm_lddqu_si128((__m128i*)input_ptr);
++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ //Get early values
++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
++
++ //Get late values
++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
++
++ //Get prompt values
++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ L_code_ptr += 8;
++ P_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
++
++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1);
++ _mm_storeu_si128((__m128i*)E_dotProductVector, output);
++
++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1);
++ _mm_storeu_si128((__m128i*)L_dotProductVector, output);
++
++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
++ _mm_storeu_si128((__m128i*)P_dotProductVector, output);
++
++ for (int i = 0; i<8; ++i)
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
++ }
++}
++
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE2 version)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // 8 complex samples (16 bytes) per 128-bit register
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_8sc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_8sc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_8sc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0xFF in every even byte: keeps the low byte of each 16-bit lane
++
++ real_E_code_acc = _mm_setzero_si128();
++ imag_E_code_acc = _mm_setzero_si128();
++ real_L_code_acc = _mm_setzero_si128();
++ imag_L_code_acc = _mm_setzero_si128();
++ real_P_code_acc = _mm_setzero_si128();
++ imag_P_code_acc = _mm_setzero_si128();
++
++ if (sse_iters>0)
++ {
++ for(unsigned int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off (unaligned loads use _mm_loadu_si128: _mm_lddqu_si128 is SSE3, not SSE2)
++ x = _mm_loadu_si128((__m128i*)input_ptr);
++ y = _mm_loadu_si128((__m128i*)carrier_ptr);
++
++ imagx = _mm_srli_si128 (x, 1); // shift one byte so the odd (imaginary) bytes land in the even positions
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ //Get early values: complex multiply of the baseband sample by the Early code, then accumulate
++ y = _mm_loadu_si128((__m128i*)E_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
++
++ //Get late values: same complex multiply-accumulate against the Late code
++ y = _mm_loadu_si128((__m128i*)L_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
++
++ //Get prompt values: same complex multiply-accumulate against the Prompt code
++ y = _mm_loadu_si128((__m128i*)P_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ L_code_ptr += 8;
++ P_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
++
++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); // keep only the low byte of each 16-bit accumulator lane
++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1);
++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move imaginary bytes back to the odd positions
++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); // re-interleave real/imaginary bytes
++ _mm_storeu_si128((__m128i*)E_dotProductVector, output);
++
++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
++ _mm_storeu_si128((__m128i*)L_dotProductVector, output);
++
++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
++ _mm_storeu_si128((__m128i*)P_dotProductVector, output);
++
++ for (int i = 0; i<8; ++i) // horizontal sum of the 8 partial results held in each vector
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(unsigned int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide SIMD passes
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
++ }
++}
++
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ lv_8sc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation
++ for(int i=0; i < num_points; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Now get early, late, and prompt values for each
++ *E_out += bb_signal_sample * E_code[i];
++ *P_out += bb_signal_sample * P_code[i];
++ *L_out += bb_signal_sample * L_code[i];
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_8sc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_8sc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_8sc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ real_E_code_acc = _mm_setzero_si128();
++ imag_E_code_acc = _mm_setzero_si128();
++ real_L_code_acc = _mm_setzero_si128();
++ imag_L_code_acc = _mm_setzero_si128();
++ real_P_code_acc = _mm_setzero_si128();
++ imag_P_code_acc = _mm_setzero_si128();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ //Get early values
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
++
++ //Get late values
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
++
++ //Get prompt values
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ L_code_ptr += 8;
++ P_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
++
++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1);
++ _mm_store_si128((__m128i*)E_dotProductVector, output);
++
++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1);
++ _mm_store_si128((__m128i*)L_dotProductVector, output);
++
++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
++ _mm_store_si128((__m128i*)P_dotProductVector, output);
++
++ for (int i = 0; i<8; ++i)
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
++ }
++}
++
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_8sc_t* E_out_ptr = E_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_8sc_t* L_out_ptr = L_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_8sc_t* P_out_ptr = P_out;
++
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ real_E_code_acc = _mm_setzero_si128();
++ imag_E_code_acc = _mm_setzero_si128();
++ real_L_code_acc = _mm_setzero_si128();
++ imag_L_code_acc = _mm_setzero_si128();
++ real_P_code_acc = _mm_setzero_si128();
++ imag_P_code_acc = _mm_setzero_si128();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++
++ imagx = _mm_srli_si128 (x, 1);
++ imagx = _mm_and_si128 (imagx, mult1);
++ realx = _mm_and_si128 (x, mult1);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ //Get early values
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
++
++ //Get late values
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
++
++ //Get prompt values
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ E_code_ptr += 8;
++ L_code_ptr += 8;
++ P_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
++
++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1);
++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1);
++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc);
++ _mm_store_si128((__m128i*)E_dotProductVector, output);
++
++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
++ _mm_store_si128((__m128i*)L_dotProductVector, output);
++
++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
++ _mm_store_si128((__m128i*)P_dotProductVector, output);
++
++ for (int i = 0; i<8; ++i)
++ {
++ *E_out_ptr += E_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get early, late, and prompt values for each
++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
++ }
++}
++
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++{
++ lv_8sc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ // perform Early, Prompt and Late correlation
++ for(int i=0; i < num_points; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Now get early, late, and prompt values for each
++ *E_out += bb_signal_sample * E_code[i];
++ *P_out += bb_signal_sample * P_code[i];
++ *L_out += bb_signal_sample * L_code[i];
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++
++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points);
++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points);
++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){
++
++ // 16-bit accumulators written by the ORC implementations, one real/imaginary pair per correlator
++ short early_re = 0;
++ short early_im = 0;
++ short prompt_re = 0;
++ short prompt_im = 0;
++ short late_re = 0;
++ short late_im = 0;
++
++ // Only the byte at offset 1 of each accumulator is returned to the caller.
++ // NOTE(review): which half of the short that is depends on the platform byte order - confirm.
++ const char* early_re_byte = ((const char*)&early_re) + 1;
++ const char* early_im_byte = ((const char*)&early_im) + 1;
++ const char* prompt_re_byte = ((const char*)&prompt_re) + 1;
++ const char* prompt_im_byte = ((const char*)&prompt_im) + 1;
++ const char* late_re_byte = ((const char*)&late_re) + 1;
++ const char* late_im_byte = ((const char*)&late_im) + 1;
++
++ // The ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is split into two kernels because a
++ // single kernel seemed to give memory problems (bad access, segmentation fault), apparently
++ // due to the length of the code. In addition, at most 4 accumulators can be used and 6 are
++ // needed. As a consequence the "carrier wipe-off" step is computed twice, once per kernel.
++ // Joining all the ORC code in one kernel would be quicker, because the "carrier wipe-off"
++ // step would then be done just one time.
++
++ // First kernel: Early and Prompt correlators
++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &early_re, &early_im, &prompt_re, &prompt_im, input, carrier, E_code, P_code, num_points);
++
++ // Second kernel: Late correlator
++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &late_re, &late_im, input, carrier, L_code, num_points);
++
++ // Build the complex 8-bit outputs from the selected bytes
++ *E_out = lv_cmake(*early_re_byte, *early_im_byte);
++ *P_out = lv_cmake(*prompt_re_byte, *prompt_im_byte);
++ *L_out = lv_cmake(*late_re_byte, *late_im_byte);
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,797 @@
++/*!
++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. In order to avoid overflow, if input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 (2 bits).
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits for the
++ * real part and 8 bits for the imaginary part), and accumulates the result
++ * in 32-bit single-precision floating-point values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16-bit vectors). It returns the input
++ * signal in baseband (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ * Bits analysis
++ *
++ * input = 8 bits
++ * carrier = 8 bits
++ * XX_code = 8 bits
++ * XX_out = 8 bits
++ * bb_signal_sample = 8 bits
++ *
++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
++ *
++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between -7 and 7 to avoid overflow (3 bits)
++ *
++ * conclusion = input and carrier must be values between -1 and 1 (1 bit) and XX_code must be values between -7 and 7 to avoid overflow (3 bits)
++ * If input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 to avoid overflow (2 bits).
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample groups, one 128-bit load each
++ // Scratch registers; most of them are passed by name to the CommonMacros helper macros below
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++ __m128 output_ps;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++ // Outputs start from zero; the vector partial sums and the scalar tail are both added onto them
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++ // _mm_set_epi8 lists bytes from most to least significant: 0xFF lands in bytes 0, 2, 4, ... and 0 in the odd bytes
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ VE_code_acc = _mm_setzero_ps();
++ E_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x = _mm_lddqu_si128((__m128i*)input_ptr);
++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++ // NOTE(review): the CM_* macros live in CommonMacros/*.h (not visible here); presumably they split real/imag parts and form the complex product - confirm
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get very early values
++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
++
++ //Get early values
++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++
++ //Get prompt values
++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++
++ //Get late values
++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++
++ //Get very late values
++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++ // Each __m128 accumulator is spilled into a 2-element lv_32fc_t array; both elements are then added to the output
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i)
++ {
++ *VE_out_ptr += VE_dotProductVector[i];
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *VL_out_ptr += VL_dotProductVector[i];
++ }
++ }
++ // Scalar tail: the num_points % 8 samples left after the vector loop, continuing from the advanced pointers
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get very early, early, prompt, late and very late values for each
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample groups, one 128-bit load each
++ // Scratch registers; most of them are passed by name to the CommonMacros helper macros below
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 output_ps_1, output_ps_2;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++ // Outputs start from zero; the vector partial sums and the scalar tail are both added onto them
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++ // _mm_set_epi8 lists bytes from most to least significant: 0xFF lands in bytes 0, 2, 4, ... and 0 in the odd bytes
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ VE_code_acc = _mm_setzero_ps();
++ E_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(unsigned int number = 0;number < sse_iters; number++){
++ // Unaligned loads use _mm_loadu_si128 (SSE2, emmintrin.h); _mm_lddqu_si128 is an SSE3 intrinsic and is not provided by emmintrin.h
++ //Perform the carrier wipe-off
++ x = _mm_loadu_si128((__m128i*)input_ptr);
++ y = _mm_loadu_si128((__m128i*)carrier_ptr);
++ // NOTE(review): the CM_* macros live in CommonMacros/*.h (not visible here); presumably they split real/imag parts and form the complex product - confirm
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get very early values
++ y = _mm_loadu_si128((__m128i*)VE_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++ // The SSE2 macro yields two float vectors per code; both are added to the same accumulator
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
++
++ //Get early values
++ y = _mm_loadu_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++ //Get prompt values
++ y = _mm_loadu_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++ //Get late values
++ y = _mm_loadu_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++ //Get very late values
++ y = _mm_loadu_si128((__m128i*)VL_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++ // Each __m128 accumulator is spilled into a 2-element lv_32fc_t array; both elements are then added to the output
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i)
++ {
++ *VE_out_ptr += VE_dotProductVector[i];
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *VL_out_ptr += VL_dotProductVector[i];
++ }
++ }
++ // Scalar tail: the num_points % 8 samples left after the vector loop, continuing from the advanced pointers
++ lv_8sc_t bb_signal_sample;
++ for(unsigned int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get very early, early, prompt, late and very late values for each
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ // Plain C path: one carrier wipe-off per sample, then five code products accumulated into the outputs.
++ // The wipe-off result is kept in an lv_8sc_t; only the per-code products are cast to lv_32fc_t.
++ lv_8sc_t bb_signal_sample = lv_cmake(0, 0);
++
++ *VE_out = 0;
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ *VL_out = 0;
++ // perform very early, Early, Prompt, Late and very late correlation (index is unsigned to match num_points)
++ for(unsigned int i = 0; i < num_points; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = input[i] * carrier[i];
++ // Cast each product to lv_32fc_t before adding it to its accumulator
++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample groups, one 128-bit load each
++ // Scratch registers; most of them are passed by name to the CommonMacros helper macros below
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++ __m128 output_ps;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++ // Outputs start from zero; the vector partial sums and the scalar tail are both added onto them
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++ // _mm_set_epi8 lists bytes from most to least significant: 0xFF lands in bytes 0, 2, 4, ... and 0 in the odd bytes
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ VE_code_acc = _mm_setzero_ps();
++ E_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++ // Aligned variant: _mm_load_si128 requires every input pointer to be 16-byte aligned
++ //Perform the carrier wipe-off
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++ // NOTE(review): the CM_* macros live in CommonMacros/*.h (not visible here); presumably they split real/imag parts and form the complex product - confirm
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ //Get very early values
++ y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
++
++ //Get early values
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++
++ //Get prompt values
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++
++ //Get late values
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++
++ //Get very late values
++ y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++ // Each __m128 accumulator is spilled into a 16-byte aligned 2-element lv_32fc_t array; both elements are then added to the output
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++ _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<2; ++i)
++ {
++ *VE_out_ptr += VE_dotProductVector[i];
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *VL_out_ptr += VL_dotProductVector[i];
++ }
++ }
++ // Scalar tail: the num_points % 8 samples left after the vector loop, continuing from the advanced pointers
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i)
++ {
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get very early, early, prompt, late and very late values for each
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{ // Aligned SSE2 variant: carrier wipe-off followed by the five-tap (VE, E, P, L, VL) correlation.
++ const unsigned int sse_iters = num_points / 8; // vector loop iterations; the pointers advance 8 elements per 128-bit load
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // one float accumulator register per correlator tap
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 output_ps_1, output_ps_2;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: all bits set in even-numbered bytes, zero in odd-numbered bytes
++
++ VE_code_acc = _mm_setzero_ps();
++ E_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ // Carrier wipe-off. _mm_load_si128 is an aligned load, so input and carrier must be 16-byte aligned.
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++ // Very Early tap (aligned load). The CM_* macros are defined in CommonMacros and not visible here; presumably they leave the correlation result in output_ps_1/output_ps_2 - TODO confirm.
++ y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
++
++ // Early tap: same sequence as the Very Early tap, using E_code
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++ // Prompt tap: same sequence, using P_code
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++ // Late tap: same sequence, using L_code
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++ // Very Late tap: same sequence, using VL_code
++ y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++ _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // aligned store: the four accumulator floats fill the two-element array
++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // same for the Early accumulator
++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // same for the Prompt accumulator
++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // same for the Late accumulator
++ _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // same for the Very Late accumulator
++
++ for (int i = 0; i<2; ++i) // fold both partial sums of every tap into its output
++ {
++ *VE_out_ptr += VE_dotProductVector[i];
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *VL_out_ptr += VL_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not consumed by the vector loop
++ {
++ // Carrier wipe-off for one sample; the product is stored in an lv_8sc_t
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Accumulate the very early, early, prompt, late and very late products for this sample
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (plain C implementation, one sample per iteration)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (single value, overwritten)
++ \param E_out Early correlation output (single value, overwritten)
++ \param P_out Prompt correlation output (single value, overwritten)
++ \param L_out Late correlation output (single value, overwritten)
++ \param VL_out Very Late correlation output (single value, overwritten)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ lv_8sc_t bb_signal_sample;
++
++ bb_signal_sample = lv_cmake(0, 0);
++
++ *VE_out = 0;
++ *E_out = 0;
++ *P_out = 0;
++ *L_out = 0;
++ *VL_out = 0;
++ // Accumulate the Very Early, Early, Prompt, Late and Very Late correlations sample by sample
++ for(int i=0; i < num_points; ++i)
++ {
++ // Carrier wipe-off; NOTE(review): the product is stored in an lv_8sc_t, presumably narrowing it to 8-bit components - confirm this is intended
++ bb_signal_sample = input[i] * carrier[i];
++
++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++ }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,1520 @@
++/*!
++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
++ * real part and 8 bits the imaginary part), and accumulates the result
++ * in 32-bit single-precision floating point values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 test variant "first": mask/shift de-interleave followed by 16-bit multiplies, unaligned loads)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // vector loop iterations; the pointers advance 8 elements per 128-bit load
++
++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // one float accumulator register per correlator tap
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 output_ps_1, output_ps_2;
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: all bits set in even-numbered bytes, zero in odd-numbered bytes
++
++ VE_code_acc = _mm_setzero_ps();
++ E_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ // Carrier wipe-off: unaligned 128-bit loads; even bytes are treated as real parts, odd bytes as imaginary parts
++ x = _mm_lddqu_si128((__m128i*)input_ptr);
++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the odd bytes land on even positions
++ imagx = _mm_and_si128 (imagx, mult1); // clear the upper byte of every 16-bit lane
++ realx = _mm_and_si128 (x, mult1); // even bytes, upper byte of every 16-bit lane cleared
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++
++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // re = xr*yr - xi*yi, in 16-bit lanes
++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // im = xr*yi + xi*yr, in 16-bit lanes
++
++ // Very Early tap: complex product of the baseband samples with the Very Early code replica
++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of each imaginary lane to the odd byte position
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real_output, odd bytes from imag_output: only the low 8 bits of each lane are kept
++
++ input_i_1 = _mm_cvtepi8_epi32(output); // sign-extend bytes 0..3 to four int32 lanes
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output); // original bytes 4..7
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32); // four floats in (re, im, re, im) order
++
++ input_i_1 = _mm_cvtepi8_epi32(output); // original bytes 8..11
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output); // original bytes 12..15
++ output = _mm_srli_si128 (output, 4); // NOTE(review): this shifted value is overwritten before it is read again
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
++
++ // Early tap: same sequence as the Very Early tap, using E_code
++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++ // Prompt tap: same sequence, using P_code
++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++ // Late tap: same sequence, using L_code
++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++ // Very Late tap: same sequence, using VL_code
++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++
++ imagy = _mm_srli_si128 (y, 1);
++ imagy = _mm_and_si128 (imagy, mult1);
++ realy = _mm_and_si128 (y, mult1);
++
++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++
++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // the four accumulator floats fill the two-element array
++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // same for the Early accumulator
++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // same for the Prompt accumulator
++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // same for the Late accumulator
++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // same for the Very Late accumulator
++
++ for (int i = 0; i<2; ++i) // fold both partial sums of every tap into its output
++ {
++ *VE_out_ptr += VE_dotProductVector[i];
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *VL_out_ptr += VL_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not consumed by the vector loop
++ {
++ // Carrier wipe-off for one sample; the product is stored in an lv_8sc_t
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Accumulate the very early, early, prompt, late and very late products for this sample
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 test variant "second": complex products built from _mm_sign_epi8 and _mm_maddubs_epi16, unaligned loads)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8; // vector loop iterations; the pointers advance 8 elements per 128-bit load
++
++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
++ __m128i mult1, output, real_output, imag_output;
++
++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // one float accumulator register per correlator tap
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 output_ps_1, output_ps_2;
++
++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); // 1 in even-numbered bytes, 255 in odd-numbered bytes: used with _mm_sign_epi8 to negate the odd bytes
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++
++ *VE_out_ptr = 0;
++ *E_out_ptr = 0;
++ *P_out_ptr = 0;
++ *L_out_ptr = 0;
++ *VL_out_ptr = 0;
++
++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: all bits set in even-numbered bytes, zero in odd-numbered bytes
++
++ VE_code_acc = _mm_setzero_ps();
++ E_code_acc = _mm_setzero_ps();
++ P_code_acc = _mm_setzero_ps();
++ L_code_acc = _mm_setzero_ps();
++ VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ // Carrier wipe-off: unaligned 128-bit loads; even bytes are treated as real parts, odd bytes as imaginary parts
++ x = _mm_lddqu_si128((__m128i*)input_ptr);
++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++ x_abs = _mm_abs_epi8 (x); // magnitudes of x, used as the unsigned operand of _mm_maddubs_epi16
++
++ y_aux = _mm_sign_epi8 (y, x); // transfer the sign of every byte of x onto y
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); // additionally negate the odd (imaginary) bytes
++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); // per 16-bit lane: xr*yr - xi*yi
++
++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); // swap the two bytes of every pair
++ y_aux = _mm_sign_epi8 (y_aux, x);
++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // per 16-bit lane: xr*yi + xi*yr
++
++ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of each imaginary lane to the odd byte position
++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real_output, odd bytes from imag_output: only the low 8 bits of each lane are kept
++
++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++ // Very Early tap: same sign/maddubs sequence, baseband samples times the Very Early code replica
++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++
++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output); // sign-extend bytes 0..3 to four int32 lanes
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output); // original bytes 4..7
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32); // four floats in (re, im, re, im) order
++
++ input_i_1 = _mm_cvtepi8_epi32(output); // original bytes 8..11
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output); // original bytes 12..15
++ output = _mm_srli_si128 (output, 4); // NOTE(review): this shifted value is overwritten before it is read again
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
++
++ // Early tap: same sequence as the Very Early tap, using E_code
++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++ // Prompt tap: same sequence, using P_code
++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++ // Late tap: same sequence, using L_code
++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++ // Very Late tap: same sequence, using VL_code
++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++
++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
++
++ input_i_1 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ input_i_2 = _mm_cvtepi8_epi32(output);
++ output = _mm_srli_si128 (output, 4);
++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
++
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // the four accumulator floats fill the two-element array
++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // same for the Early accumulator
++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // same for the Prompt accumulator
++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // same for the Late accumulator
++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // same for the Very Late accumulator
++
++ for (int i = 0; i<2; ++i) // fold both partial sums of every tap into its output
++ {
++ *VE_out_ptr += VE_dotProductVector[i];
++ *E_out_ptr += E_dotProductVector[i];
++ *P_out_ptr += P_dotProductVector[i];
++ *L_out_ptr += L_dotProductVector[i];
++ *VL_out_ptr += VL_dotProductVector[i];
++ }
++ }
++
++ lv_8sc_t bb_signal_sample;
++ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not consumed by the vector loop
++ {
++ // Carrier wipe-off for one sample; the product is stored in an lv_8sc_t
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Accumulate the very early, early, prompt, late and very late products for this sample
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads, 8 complex samples per SIMD iteration, scalar tail for the remaining num_points % 8 samples)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (one complex value, overwritten)
++ \param E_out Early correlation output (one complex value, overwritten)
++ \param P_out Prompt correlation output (one complex value, overwritten)
++ \param L_out Late correlation output (one complex value, overwritten)
++ \param VL_out Very Late correlation output (one complex value, overwritten)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // 16 bytes per load = 8 complex 8-bit samples
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i mult1, real_output, imag_output;
++
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); // +1 on even (real) bytes, 0xFF (-1) on odd (imag) bytes
++    __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // swaps the two bytes of every 16-bit pair (real <-> imag)
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // blend mask: high bit set on even bytes only
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    /* Unconditional scope: with sse_iters == 0 the SIMD loop is skipped and the outputs are still set to zero, so the tail loop never accumulates onto stale caller data. */
++    {
++        for(unsigned int number = 0; number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x = _mm_lddqu_si128((const __m128i*)input_ptr);
++            y = _mm_lddqu_si128((const __m128i*)carrier_ptr);
++
++            x_abs = _mm_abs_epi8 (x); // first operand of _mm_maddubs_epi16 is read as unsigned bytes
++
++            y_aux = _mm_sign_epi8 (y, x); // move the sign of x onto y
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); // negate the imaginary bytes
++            real_output = _mm_maddubs_epi16 (x_abs, y_aux); // xr*yr - xi*yi in each 16-bit lane
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, x);
++            imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // xr*yi + xi*yr in each 16-bit lane
++
++            imag_output = _mm_slli_si128 (imag_output, 1); // low byte of every imag lane moves to the odd byte position
++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real, odd bytes from imag: only the low 8 bits of each product are kept
++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++            //Get very early values
++            y = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output); // lower four 16-bit lanes widened to 32 bits
++            real_output = _mm_srli_si128 (real_output, 8); // bring the upper four lanes down
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++            //Get early values
++            y = _mm_lddqu_si128((const __m128i*)E_code_ptr);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y = _mm_lddqu_si128((const __m128i*)P_code_ptr);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y = _mm_lddqu_si128((const __m128i*)L_code_ptr);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            //Get very late values
++            y = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++        _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
++        {
++            VE_out_real += real_VE_dotProductVector[i];
++            VE_out_imag += imag_VE_dotProductVector[i];
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++            VL_out_real += real_VL_dotProductVector[i];
++            VL_out_imag += imag_VL_dotProductVector[i];
++        }
++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++    }
++
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i = 0; i < num_points % 8; ++i)
++    {
++        //Perform the carrier wipe-off (NOTE(review): kept in lv_16sc_t here, while the SIMD loop keeps only the low 8 bits of each product)
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads, 8 complex samples per SIMD iteration); code bytes equal to -128 are moved to -127 inside the SIMD loop before the sign transfer
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (one complex value, overwritten)
++ \param E_out Early correlation output (one complex value, overwritten)
++ \param P_out Prompt correlation output (one complex value, overwritten)
++ \param L_out Late correlation output (one complex value, overwritten)
++ \param VL_out Very Late correlation output (one complex value, overwritten)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // 16 bytes per load = 8 complex 8-bit samples
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32;
++    __m128 real_output_ps, imag_output_ps;
++    __m128i minus128control;
++
++    __m128i minus128 = _mm_set1_epi8 (-128);
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); // +1 on even (real) bytes, 0xFF (-1) on odd (imag) bytes
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // swaps the two bytes of every 16-bit pair (real <-> imag)
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // blend mask: high bit set on even bytes only
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    /* Unconditional scope: with sse_iters == 0 the SIMD loop is skipped and the outputs are still set to zero, so the tail loop never accumulates onto stale caller data. */
++    {
++        for(unsigned int number = 0; number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x = _mm_lddqu_si128((const __m128i*)input_ptr);
++            y = _mm_lddqu_si128((const __m128i*)carrier_ptr);
++
++            x_abs = _mm_abs_epi8 (x); // first operand of _mm_maddubs_epi16 is read as unsigned bytes
++
++            y_aux = _mm_sign_epi8 (y, x); // move the sign of x onto y
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); // negate the imaginary bytes
++            real_output = _mm_maddubs_epi16 (x_abs, y_aux); // xr*yr - xi*yi in each 16-bit lane
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, x);
++            imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // xr*yi + xi*yr in each 16-bit lane
++
++            imag_output = _mm_slli_si128 (imag_output, 1); // low byte of every imag lane moves to the odd byte position
++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real, odd bytes from imag: only the low 8 bits of each product are kept
++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++            //Get very early values
++            y = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
++            minus128control = _mm_cmpeq_epi8 (y, minus128); // 0xFF (-1) in every byte equal to -128
++            y = _mm_sub_epi8 (y, minus128control); // y - (-1): -128 becomes -127, other bytes unchanged
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output); // lower four 16-bit lanes widened to 32 bits
++            real_output = _mm_srli_si128 (real_output, 8); // bring the upper four lanes down
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++            //Get early values
++            y = _mm_lddqu_si128((const __m128i*)E_code_ptr);
++            minus128control = _mm_cmpeq_epi8 (y, minus128);
++            y = _mm_sub_epi8 (y, minus128control);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y = _mm_lddqu_si128((const __m128i*)P_code_ptr);
++            minus128control = _mm_cmpeq_epi8 (y, minus128);
++            y = _mm_sub_epi8 (y, minus128control);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y = _mm_lddqu_si128((const __m128i*)L_code_ptr);
++            minus128control = _mm_cmpeq_epi8 (y, minus128);
++            y = _mm_sub_epi8 (y, minus128control);
++
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            //Get very late values
++            y = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
++            minus128control = _mm_cmpeq_epi8 (y, minus128);
++            y = _mm_sub_epi8 (y, minus128control);
++
++            // Same correlation steps as for the other four code replicas
++            y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
++            y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
++            real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
++            y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
++            imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
++
++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++            real_output = _mm_srli_si128 (real_output, 8);
++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++
++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++            imag_output = _mm_srli_si128 (imag_output, 8);
++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++
++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++        _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
++        {
++            VE_out_real += real_VE_dotProductVector[i];
++            VE_out_imag += imag_VE_dotProductVector[i];
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++            VL_out_real += real_VL_dotProductVector[i];
++            VL_out_imag += imag_VL_dotProductVector[i];
++        }
++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++    }
++
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i = 0; i < num_points % 8; ++i)
++    {
++        //Perform the carrier wipe-off (NOTE(review): kept in lv_16sc_t here, while the SIMD loop keeps only the low 8 bits of each product)
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each (NOTE(review): unlike the SIMD loop, -128 code bytes are not moved to -127 here - confirm whether the tail should match)
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++
++#ifdef LV_HAVE_GENERIC
++#include <stdio.h>
++#include <tmmintrin.h>
++
++/*!
++ \brief Scalar carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlation; any code component equal to -128 is replaced by -127 before it is used
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (one complex value, zeroed then accumulated)
++ \param E_out Early correlation output (one complex value, zeroed then accumulated)
++ \param P_out Prompt correlation output (one complex value, zeroed then accumulated)
++ \param L_out Late correlation output (one complex value, zeroed then accumulated)
++ \param VL_out Very Late correlation output (one complex value, zeroed then accumulated)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0;
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    // Per-sample copies of the code values in lv_16sc_t, patched below when a component is -128
++    lv_16sc_t VE_code_value;
++    lv_16sc_t E_code_value;
++    lv_16sc_t P_code_value;
++    lv_16sc_t L_code_value;
++    lv_16sc_t VL_code_value;
++    lv_16sc_t bb_signal_sample;
++
++    for(unsigned int i = 0; i < num_points; ++i) // unsigned index: num_points is unsigned
++    {
++        VE_code_value = VE_code[i];
++        E_code_value = E_code[i];
++        P_code_value = P_code[i];
++        L_code_value = L_code[i];
++        VL_code_value = VL_code[i];
++
++        if(lv_creal(VE_code_value) == -128) // NOTE(review): presumably mirrors the SIMD kernels, where negating -128 in 8 bits overflows - confirm
++        {
++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++        }
++        if(lv_cimag(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++        }
++
++        if(lv_creal(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++        }
++        if(lv_cimag(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++        }
++
++        if(lv_creal(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++        }
++        if(lv_cimag(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++        }
++
++        if(lv_creal(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++        }
++        if(lv_cimag(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++        }
++
++        if(lv_creal(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
++        }
++        if(lv_cimag(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
++        }
++
++        //Perform the carrier wipe-off
++        bb_signal_sample = input[i] * carrier[i];
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++    }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++//#ifdef LV_HAVE_GENERIC
++//#include <stdio.h>
++//#include <stdlib.h>
++//#include <tmmintrin.h>
++//
++//#ifndef MAX
++//#define MAX(a,b) ((a) > (b) ? a : b)
++//#endif
++//
++//#ifndef MIN
++//#define MIN(a,b) ((a) < (b) ? a : b)
++//#endif
++//
++///*!
++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++// \param input The input signal input
++// \param carrier The carrier signal input
++// \param VE_code Very Early PRN code replica input
++// \param E_code Early PRN code replica input
++// \param P_code Prompt PRN code replica input
++// \param L_code Late PRN code replica input
++// \param VL_code Very Late PRN code replica input
++// \param VE_out Very Early correlation output
++// \param E_out Early correlation output
++// \param P_out Prompt correlation output
++// \param L_out Late correlation output
++// \param VL_out Very Late correlation output
++// \param num_points The number of complex values in vectors
++// */
++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++//{
++// *VE_out = 0;
++// *E_out = 0;
++// *P_out = 0;
++// *L_out = 0;
++// *VL_out = 0;
++//
++// lv_16sc_t VE_out16;
++// lv_16sc_t E_out16;
++// lv_16sc_t P_out16;
++// lv_16sc_t L_out16;
++// lv_16sc_t VL_out16;
++//
++// int32_t max = 32767;
++// int32_t min = -32768;
++//
++// int16_t real_real;
++// int16_t imag_imag;
++// int16_t real_imag;
++// int16_t imag_real;
++// int32_t out_real_32;
++// int32_t out_imag_32;
++// int16_t out_real_16;
++// int16_t out_imag_16;
++// int16_t aux1;
++// int16_t aux2;
++//
++//
++// lv_8sc_t bb_signal_sample = lv_cmake(0, 0);
++//
++// // perform very early, Early, Prompt, Late and very late correlation
++// for(int i=0; i < num_points; ++i)
++// {
++// //Perform the carrier wipe-off
++// bb_signal_sample = input[i] * carrier[i];
++//
++// aux1 = (int16_t)lv_creal(bb_signal_sample);
++// aux2 = (int16_t)lv_creal(VE_code[i]);
++// real_real = aux1*aux2;
++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
++// aux2 = (int16_t)lv_cimag(VE_code[i]);
++// imag_imag = aux1*aux2;
++// aux1 = (int16_t)lv_creal(bb_signal_sample);
++// aux2 = (int16_t)lv_cimag(VE_code[i]);
++// real_imag = aux1*aux2;
++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
++// aux2 = (int16_t)lv_creal(VE_code[i]);
++// imag_real = aux1*aux2;
++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag;
++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real;
++// out_real_16 = MIN(MAX(out_real_32, min), max);
++// out_imag_16 = MIN(MAX(out_imag_32, min), max);
++// VE_out16 = lv_cmake(out_real_16, out_imag_16);
++//
++//
++//
++// if(lv_creal(L_code[i]) == -128)
++// {
++// int8_t* L_pointer = (int8_t*)&L_code[i];
++// *L_pointer = -127;
++// }
++// if(lv_cimag(L_code[i]) == -128)
++// {
++// int8_t* L_pointer = (int8_t*)&L_code[i];
++// L_pointer++;
++// *L_pointer = -127;
++// }
++// aux1 = (int16_t)lv_creal(bb_signal_sample);
++// aux2 = (int16_t)lv_creal(L_code[i]);
++// real_real = aux1*aux2;
++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
++// aux2 = (int16_t)lv_cimag(L_code[i]);
++// imag_imag = aux1*aux2;
++// aux1 = (int16_t)lv_creal(bb_signal_sample);
++// aux2 = (int16_t)lv_cimag(L_code[i]);
++// real_imag = aux1*aux2;
++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
++// aux2 = (int16_t)lv_creal(L_code[i]);
++// imag_real = aux1*aux2;
++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag;
++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real;
++// out_real_16 = MIN(MAX(out_real_32, min), max);
++// out_imag_16 = MIN(MAX(out_imag_32, min), max);
++// L_out16 = lv_cmake(out_real_16, out_imag_16);
++//
++// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i];
++// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i];
++// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i];
++//
++//
++// *VE_out += (lv_32fc_t) VE_out16;
++// *E_out += (lv_32fc_t) E_out16;
++// *P_out += (lv_32fc_t) P_out16;
++// *L_out += (lv_32fc_t) L_out16;
++// *VL_out += (lv_32fc_t) VL_out16;
++//
++// //error in the real part of L with 32 samples
++// //*L_out = lv_cmake(12, 12);
++// }
++//}
++//
++//#endif /* LV_HAVE_GENERIC */
++
++//#ifdef LV_HAVE_GENERIC
++///*!
++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++// \param input The input signal input
++// \param carrier The carrier signal input
++// \param VE_code Very Early PRN code replica input
++// \param E_code Early PRN code replica input
++// \param P_code Prompt PRN code replica input
++// \param L_code Late PRN code replica input
++// \param VL_code Very Late PRN code replica input
++// \param VE_out Very Early correlation output
++// \param E_out Early correlation output
++// \param P_out Prompt correlation output
++// \param L_out Late correlation output
++// \param VL_out Very Late correlation output
++// \param num_points The number of complex values in vectors
++// */
++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++//{
++// lv_8sc_t bb_signal_sample;
++//
++// bb_signal_sample = lv_cmake(0, 0);
++//
++// *VE_out = 0;
++// *E_out = 0;
++// *P_out = 0;
++// *L_out = 0;
++// *VL_out = 0;
++// // perform very early, Early, Prompt, Late and very late correlation
++// for(int i=0; i < num_points; ++i)
++// {
++// //Perform the carrier wipe-off
++// bb_signal_sample = input[i] * carrier[i];
++//
++// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++// *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++// }
++//}
++//
++//#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,772 @@
++/*!
++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
++ * real part and 8 bits the imaginary part), and accumulates the result
++ * in 32-bit single-precision floating-point values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ * Bits analysis
++ *
++ * input = 8 bits
++ * carrier = 8 bits
++ * XX_code = 8 bits
++ * XX_out16 = 16 bits
++ * bb_signal_sample = 8 bits
++ *
++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
++ *
++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (unaligned loads). The five outputs are always overwritten, also when num_points is smaller than 8.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8;
++    const unsigned int tail_points = num_points % 8; // samples left over for the scalar loop
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++    __m128i minus128control;
++
++    __m128i minus128 = _mm_set1_epi8 (-128);
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    // Vector path: 8 complex samples per iteration. With num_points < 8 it does not run and the accumulators stay at zero.
++    for(unsigned int number = 0; number < sse_iters; number++){
++
++        //Perform the carrier wipe-off
++        x = _mm_lddqu_si128((__m128i*)input_ptr);
++        y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++        x_abs = _mm_abs_epi8 (x);
++
++        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++
++        imag_output = _mm_slli_si128 (imag_output, 1);
++        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
++        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++        //Get very early values
++        y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++
++        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++        //Get early values
++        y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++        //Get prompt values
++        y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++        //Get late values
++        y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++        //Get very late values
++        y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++
++        CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++        input_ptr += 8;
++        carrier_ptr += 8;
++        VE_code_ptr += 8;
++        E_code_ptr += 8;
++        P_code_ptr += 8;
++        L_code_ptr += 8;
++        VL_code_ptr += 8;
++    }
++
++    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++    _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++    _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++    for (int i = 0; i<4; ++i)
++    {
++        VE_out_real += real_VE_dotProductVector[i];
++        VE_out_imag += imag_VE_dotProductVector[i];
++        E_out_real += real_E_dotProductVector[i];
++        E_out_imag += imag_E_dotProductVector[i];
++        P_out_real += real_P_dotProductVector[i];
++        P_out_imag += imag_P_dotProductVector[i];
++        L_out_real += real_L_dotProductVector[i];
++        L_out_imag += imag_L_dotProductVector[i];
++        VL_out_real += real_VL_dotProductVector[i];
++        VL_out_imag += imag_VL_dotProductVector[i];
++    }
++    // Written unconditionally: the scalar tail below accumulates with +=, so the outputs must be defined even without any SSE iteration.
++    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++
++    // Scalar tail: the remaining num_points % 8 samples, with -128 replaced by -127 in all five code replicas.
++    for(unsigned int i = 0; i < tail_points; ++i)
++    {
++        lv_16sc_t VE_code_value = *VE_code_ptr++;
++        lv_16sc_t E_code_value = *E_code_ptr++;
++        lv_16sc_t P_code_value = *P_code_ptr++;
++        lv_16sc_t L_code_value = *L_code_ptr++;
++        lv_16sc_t VL_code_value = *VL_code_ptr++;
++
++        if(lv_creal(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++        }
++        if(lv_cimag(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++        }
++
++        if(lv_creal(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++        }
++        if(lv_cimag(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++        }
++
++        if(lv_creal(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++        }
++        if(lv_cimag(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++        }
++
++        if(lv_creal(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++        }
++        if(lv_cimag(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++        }
++
++        if(lv_creal(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
++        }
++        if(lv_cimag(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
++        }
++
++        //Perform the carrier wipe-off
++        lv_16sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include <stdio.h>
++#include <tmmintrin.h>
++
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (generic version). Code replica components equal to -128 are replaced with -127 before correlating.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    // The correlators below accumulate with +=, so every output starts from zero.
++    *VE_out = 0;
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    // The index is unsigned like num_points; with num_points == 0 the outputs simply stay at zero.
++    for(unsigned int i = 0; i < num_points; ++i)
++    {
++        lv_16sc_t VE_code_value = VE_code[i];
++        lv_16sc_t E_code_value = E_code[i];
++        lv_16sc_t P_code_value = P_code[i];
++        lv_16sc_t L_code_value = L_code[i];
++        lv_16sc_t VL_code_value = VL_code[i];
++
++        // Very early replica: replace -128 with -127 in each component
++        if(lv_creal(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++        }
++        if(lv_cimag(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++        }
++
++        // Early replica: replace -128 with -127 in each component
++        if(lv_creal(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++        }
++        if(lv_cimag(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++        }
++
++        // Prompt replica: replace -128 with -127 in each component
++        if(lv_creal(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++        }
++        if(lv_cimag(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++        }
++
++        // Late replica: replace -128 with -127 in each component
++        if(lv_creal(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++        }
++        if(lv_cimag(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++        }
++
++        // Very late replica: replace -128 with -127 in each component
++        if(lv_creal(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
++        }
++        if(lv_cimag(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
++        }
++
++        //Perform the carrier wipe-off
++        lv_16sc_t bb_signal_sample = input[i] * carrier[i];
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++ const unsigned int sse_iters = num_points / 8;
++
++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
++ __m128i real_output, imag_output;
++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++ __m128i input_i_1, input_i_2, output_i32;
++ __m128 real_output_ps, imag_output_ps;
++ __m128i minus128control;
++
++ __m128i minus128 = _mm_set1_epi8 (-128);
++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++ const lv_8sc_t* input_ptr = input;
++ const lv_8sc_t* carrier_ptr = carrier;
++
++ const lv_8sc_t* VE_code_ptr = VE_code;
++ lv_32fc_t* VE_out_ptr = VE_out;
++ const lv_8sc_t* E_code_ptr = E_code;
++ lv_32fc_t* E_out_ptr = E_out;
++ const lv_8sc_t* P_code_ptr = P_code;
++ lv_32fc_t* P_out_ptr = P_out;
++ const lv_8sc_t* L_code_ptr = L_code;
++ lv_32fc_t* L_out_ptr = L_out;
++ const lv_8sc_t* VL_code_ptr = VL_code;
++ lv_32fc_t* VL_out_ptr = VL_out;
++
++ float VE_out_real = 0;
++ float VE_out_imag = 0;
++ float E_out_real = 0;
++ float E_out_imag = 0;
++ float P_out_real = 0;
++ float P_out_imag = 0;
++ float L_out_real = 0;
++ float L_out_imag = 0;
++ float VL_out_real = 0;
++ float VL_out_imag = 0;
++
++ real_VE_code_acc = _mm_setzero_ps();
++ imag_VE_code_acc = _mm_setzero_ps();
++ real_E_code_acc = _mm_setzero_ps();
++ imag_E_code_acc = _mm_setzero_ps();
++ real_P_code_acc = _mm_setzero_ps();
++ imag_P_code_acc = _mm_setzero_ps();
++ real_L_code_acc = _mm_setzero_ps();
++ imag_L_code_acc = _mm_setzero_ps();
++ real_VL_code_acc = _mm_setzero_ps();
++ imag_VL_code_acc = _mm_setzero_ps();
++
++ if (sse_iters>0)
++ {
++ for(int number = 0;number < sse_iters; number++){
++
++ //Perform the carrier wipe-off
++ x = _mm_load_si128((__m128i*)input_ptr);
++ y = _mm_load_si128((__m128i*)carrier_ptr);
++
++ x_abs = _mm_abs_epi8 (x);
++
++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++
++ imag_output = _mm_slli_si128 (imag_output, 1);
++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++ //Get very early values
++ y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++ //Get early values
++ y = _mm_load_si128((__m128i*)E_code_ptr);
++
++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++ //Get prompt values
++ y = _mm_load_si128((__m128i*)P_code_ptr);
++
++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++ //Get late values
++ y = _mm_load_si128((__m128i*)L_code_ptr);
++
++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++ //Get very late values
++ y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++ input_ptr += 8;
++ carrier_ptr += 8;
++ VE_code_ptr += 8;
++ E_code_ptr += 8;
++ P_code_ptr += 8;
++ L_code_ptr += 8;
++ VL_code_ptr += 8;
++ }
++
++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++ for (int i = 0; i<4; ++i)
++ {
++ VE_out_real += real_VE_dotProductVector[i];
++ VE_out_imag += imag_VE_dotProductVector[i];
++ E_out_real += real_E_dotProductVector[i];
++ E_out_imag += imag_E_dotProductVector[i];
++ P_out_real += real_P_dotProductVector[i];
++ P_out_imag += imag_P_dotProductVector[i];
++ L_out_real += real_L_dotProductVector[i];
++ L_out_imag += imag_L_dotProductVector[i];
++ VL_out_real += real_VL_dotProductVector[i];
++ VL_out_imag += imag_VL_dotProductVector[i];
++ }
++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++ }
++
++ if(num_points%8!=0)
++ {
++ lv_16sc_t bb_signal_sample;
++ lv_16sc_t VE_code_value;
++ lv_16sc_t E_code_value;
++ lv_16sc_t P_code_value;
++ lv_16sc_t L_code_value;
++ lv_16sc_t VL_code_value;
++
++ for(int i=0; i < num_points%8; ++i)
++ {
++ VE_code_value = *VE_code_ptr++;
++ E_code_value = *E_code_ptr++;
++ P_code_value = *P_code_ptr++;
++ L_code_value = *L_code_ptr++;
++ VL_code_value = *VL_code_ptr++;
++
++ if(lv_creal(VE_code_value) == -128)
++ {
++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++ }
++ if(lv_cimag(VE_code_value) == -128)
++ {
++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++ }
++
++ if(lv_creal(E_code_value) == -128)
++ {
++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++ }
++ if(lv_cimag(E_code_value) == -128)
++ {
++ E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++ }
++
++ if(lv_creal(P_code_value) == -128)
++ {
++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++ }
++ if(lv_cimag(P_code_value) == -128)
++ {
++ P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++ }
++
++ if(lv_creal(L_code_value) == -128)
++ {
++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++ }
++ if(lv_cimag(L_code_value) == -128)
++ {
++ L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++ }
++
++ //Perform the carrier wipe-off
++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++ // Now get very early, early, prompt, late and very late values for each
++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++ }
++ }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include <stdio.h>
++#include <tmmintrin.h>
++
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (generic implementation). Any -128 real or imaginary component of a code replica is replaced by -127 before it is used.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0;
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    lv_16sc_t VE_code_value;
++    lv_16sc_t E_code_value;
++    lv_16sc_t P_code_value;
++    lv_16sc_t L_code_value;
++    lv_16sc_t VL_code_value;
++    lv_16sc_t bb_signal_sample;
++
++    for(unsigned int i = 0; i < num_points; ++i) // unsigned index: same type as num_points
++    {
++        VE_code_value = VE_code[i];
++        E_code_value = E_code[i];
++        P_code_value = P_code[i];
++        L_code_value = L_code[i];
++        VL_code_value = VL_code[i];
++        // Replace any -128 component of each code replica with -127 (presumably to match the SIMD kernels; confirm)
++        if(lv_creal(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++        }
++        if(lv_cimag(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++        }
++
++        if(lv_creal(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++        }
++        if(lv_cimag(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++        }
++
++        if(lv_creal(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++        }
++        if(lv_cimag(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++        }
++
++        if(lv_creal(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++        }
++        if(lv_cimag(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++        }
++
++        if(lv_creal(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
++        }
++        if(lv_cimag(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
++        }
++
++        // Carrier wipe-off: mix the input sample with the local carrier sample
++        bb_signal_sample = input[i] * carrier[i];
++        // Correlate the base-band sample with each code replica and accumulate into the float outputs
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,554 @@
++/*!
++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check whether the inputs contain a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (the generic implementation will give different results than the volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
++ * real part and 8 bits the imaginary part), and accumulates the result
++ * in 32-bit single-precision floating point values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16 bits vectors) It returns the input
++ * signal in base band (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ * Bits analysis
++ *
++ * input = 8 bits
++ * carrier = 8 bits
++ * XX_code = 8 bits
++ * XX_out16 = 16 bits
++ * bb_signal_sample = 8 bits
++ *
++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits, so input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
++ *
++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits, so XX_code must be values between -127 and 127 to avoid overflow (7 bits)
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads). Inputs are not checked for -128 values. Outputs are always written, also when num_points is smaller than 8.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // 8 complex 8-bit samples per 128-bit register
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    // Always executed, even with sse_iters == 0: the zeroed accumulators then store 0 into the outputs, so the tail loop below never adds to uninitialized memory
++    {
++        for(unsigned int number = 0; number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x = _mm_lddqu_si128((const __m128i*)input_ptr);
++            y = _mm_lddqu_si128((const __m128i*)carrier_ptr);
++
++            x_abs = _mm_abs_epi8 (x);
++
++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++            // Interleave the products: even bytes are taken from real_output, odd bytes from imag_output shifted up by one byte
++            imag_output = _mm_slli_si128 (imag_output, 1);
++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++            //Get very early values
++            y = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++            //Get early values
++            y = _mm_lddqu_si128((const __m128i*)E_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y = _mm_lddqu_si128((const __m128i*)P_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y = _mm_lddqu_si128((const __m128i*)L_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            //Get very late values
++            y = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++        _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<4; ++i)
++        {
++            VE_out_real += real_VE_dotProductVector[i];
++            VE_out_imag += imag_VE_dotProductVector[i];
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++            VL_out_real += real_VL_dotProductVector[i];
++            VL_out_imag += imag_VL_dotProductVector[i];
++        }
++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++    }
++    // Scalar tail: the num_points % 8 samples that did not fill a whole SSE register
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i = 0; i < num_points%8; ++i)
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include <stdio.h>
++#include <tmmintrin.h>
++
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (generic implementation). Inputs are not checked for -128 values.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0;
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    lv_16sc_t bb_signal_sample;
++
++    for(unsigned int i = 0; i < num_points; ++i) // unsigned index: same type as num_points
++    {
++        // Carrier wipe-off: mix the input sample with the local carrier sample
++        bb_signal_sample = input[i] * carrier[i];
++        // Correlate the base-band sample with each code replica and accumulate into the float outputs
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
++#include <float.h>
++#include <string.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, aligned loads via _mm_load_si128). Inputs are not checked for -128 values. Outputs are always written, also when num_points is smaller than 8.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // 8 complex 8-bit samples per 128-bit register
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    // Always executed, even with sse_iters == 0: the zeroed accumulators then store 0 into the outputs, so the tail loop below never adds to uninitialized memory
++    {
++        for(unsigned int number = 0; number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x = _mm_load_si128((const __m128i*)input_ptr);
++            y = _mm_load_si128((const __m128i*)carrier_ptr);
++
++            x_abs = _mm_abs_epi8 (x);
++
++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++            // Interleave the products: even bytes are taken from real_output, odd bytes from imag_output shifted up by one byte
++            imag_output = _mm_slli_si128 (imag_output, 1);
++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++            //Get very early values
++            y = _mm_load_si128((const __m128i*)VE_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++            //Get early values
++            y = _mm_load_si128((const __m128i*)E_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y = _mm_load_si128((const __m128i*)P_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y = _mm_load_si128((const __m128i*)L_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            //Get very late values
++            y = _mm_load_si128((const __m128i*)VL_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++        _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<4; ++i)
++        {
++            VE_out_real += real_VE_dotProductVector[i];
++            VE_out_imag += imag_VE_dotProductVector[i];
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++            VL_out_real += real_VL_dotProductVector[i];
++            VL_out_imag += imag_VL_dotProductVector[i];
++        }
++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++    }
++    // Scalar tail: the num_points % 8 samples that did not fill a whole SSE register
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i = 0; i < num_points%8; ++i)
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++ #ifdef LV_HAVE_GENERIC
++ #include <stdio.h>
++ /* Portable fallback: no SIMD intrinsics header is included here because none is used. */
++
++ /*!
++  \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
++  \param input The input signal input
++  \param carrier The carrier signal input
++  \param VE_code Very Early PRN code replica input
++  \param E_code Early PRN code replica input
++  \param P_code Prompt PRN code replica input
++  \param L_code Late PRN code replica input
++  \param VL_code Very Late PRN code replica input
++  \param VE_out Very Early correlation output (reset to 0 on entry, then accumulated)
++  \param E_out Early correlation output (reset to 0 on entry, then accumulated)
++  \param P_out Prompt correlation output (reset to 0 on entry, then accumulated)
++  \param L_out Late correlation output (reset to 0 on entry, then accumulated)
++  \param VL_out Very Late correlation output (reset to 0 on entry, then accumulated)
++  \param num_points The number of complex values in vectors
++  */
++ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++ {
++     *VE_out = 0;
++     *E_out = 0;
++     *P_out = 0;
++     *L_out = 0;
++     *VL_out = 0;
++
++     lv_16sc_t bb_signal_sample; // carrier-free sample, held as a 16-bit complex value
++
++     for(unsigned int i = 0; i < num_points; ++i) // unsigned index: same type as num_points
++         {
++             //Perform the carrier wipe-off
++             bb_signal_sample = input[i] * carrier[i];
++             // Now get very early, early, prompt, late and very late values for each
++             *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++             *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++             *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++             *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++             *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++         }
++ }
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,210 @@
++/*!
++ * \file volk_gnsssdr_8u_x2_multiply_8u.h
++ * \brief Volk protokernel: multiplies unsigned char values
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that multiplies unsigned char values (8 bits data)
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++ #ifdef LV_HAVE_SSE3
++ #include <pmmintrin.h>
++ #include <emmintrin.h>
++ /*!
++  \brief Multiplies two unsigned char vectors element by element, keeping the low 8 bits of each product (unaligned SSE3 version)
++  \param cChar Output vector where the products are stored
++  \param aChar First input vector to be multiplied
++  \param bChar Second input vector to be multiplied
++  \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++  */
++ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++
++     const unsigned int sse_iters = num_points / 16;
++
++     __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
++     unsigned char* c = cChar;
++     const unsigned char* a = aChar;
++     const unsigned char* b = bChar;
++     const __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask: low byte of every 16-bit lane
++     for(unsigned int number = 0; number < sse_iters; number++){
++         x = _mm_lddqu_si128((const __m128i*)a);
++         y = _mm_lddqu_si128((const __m128i*)b);
++
++         // Odd bytes (x1/y1) and even bytes (x2/y2) are widened to one value per 16-bit lane and multiplied separately
++         x1 = _mm_srli_si128 (x, 1);
++         x1 = _mm_and_si128 (x1, mult1);
++         x2 = _mm_and_si128 (x, mult1);
++
++         y1 = _mm_srli_si128 (y, 1);
++         y1 = _mm_and_si128 (y1, mult1);
++         y2 = _mm_and_si128 (y, mult1);
++
++         x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
++         x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
++
++         tmp = _mm_and_si128 (x1_mult_y1, mult1);  // low byte of the odd-position products
++         tmp1 = _mm_slli_si128 (tmp, 1);           // moved back to the odd byte positions
++         tmp2 = _mm_and_si128 (x2_mult_y2, mult1); // low byte of the even-position products
++         totalc = _mm_or_si128 (tmp1, tmp2);
++
++         _mm_storeu_si128((__m128i*)c, totalc);
++
++         a += 16;
++         b += 16;
++         c += 16;
++     }
++
++     for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the last num_points % 16 values
++         {
++             *c++ = (*a++) * (*b++);
++         }
++ }
++ #endif /* LV_HAVE_SSE3 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++  \brief Multiplies two unsigned char vectors element by element, keeping the low 8 bits of each product (portable version)
++  \param cChar Output vector where the products are stored
++  \param aChar First input vector to be multiplied
++  \param bChar Second input vector to be multiplied
++  \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++  */
++ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++     unsigned char* cPtr = cChar;
++     const unsigned char* aPtr = aChar;
++     const unsigned char* bPtr = bChar;
++
++     for(unsigned int number = 0; number < num_points; number++){ // unsigned counter: same type as num_points
++         *cPtr++ = (*aPtr++) * (*bPtr++); // the int product is truncated to 8 bits by the store
++     }
++ }
++ #endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++
++ #ifdef LV_HAVE_SSE3
++ #include <pmmintrin.h>
++ #include <emmintrin.h>
++ /*!
++  \brief Multiplies two unsigned char vectors element by element, keeping the low 8 bits of each product (aligned SSE3 version)
++  \param cChar Output vector where the products are stored; accessed with aligned 128-bit stores
++  \param aChar First input vector to be multiplied; accessed with aligned 128-bit loads
++  \param bChar Second input vector to be multiplied; accessed with aligned 128-bit loads
++  \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++  */
++ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++
++     const unsigned int sse_iters = num_points / 16;
++
++     __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
++     unsigned char* c = cChar;
++     const unsigned char* a = aChar;
++     const unsigned char* b = bChar;
++     const __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask: low byte of every 16-bit lane
++     for(unsigned int number = 0; number < sse_iters; number++){
++         x = _mm_load_si128((const __m128i*)a);
++         y = _mm_load_si128((const __m128i*)b);
++
++         // Odd bytes (x1/y1) and even bytes (x2/y2) are widened to one value per 16-bit lane and multiplied separately
++         x1 = _mm_srli_si128 (x, 1);
++         x1 = _mm_and_si128 (x1, mult1);
++         x2 = _mm_and_si128 (x, mult1);
++
++         y1 = _mm_srli_si128 (y, 1);
++         y1 = _mm_and_si128 (y1, mult1);
++         y2 = _mm_and_si128 (y, mult1);
++
++         x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
++         x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
++
++         tmp = _mm_and_si128 (x1_mult_y1, mult1);  // low byte of the odd-position products
++         tmp1 = _mm_slli_si128 (tmp, 1);           // moved back to the odd byte positions
++         tmp2 = _mm_and_si128 (x2_mult_y2, mult1); // low byte of the even-position products
++         totalc = _mm_or_si128 (tmp1, tmp2);
++
++         _mm_store_si128((__m128i*)c, totalc);
++
++         a += 16;
++         b += 16;
++         c += 16;
++     }
++
++     for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the last num_points % 16 values
++         {
++             *c++ = (*a++) * (*b++);
++         }
++ }
++ #endif /* LV_HAVE_SSE3 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++  \brief Multiplies two unsigned char vectors element by element, keeping the low 8 bits of each product (portable version)
++  \param cChar Output vector where the products are stored
++  \param aChar First input vector to be multiplied
++  \param bChar Second input vector to be multiplied
++  \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++  */
++ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++     unsigned char* cPtr = cChar;
++     const unsigned char* aPtr = aChar;
++     const unsigned char* bPtr = bChar;
++
++     for(unsigned int number = 0; number < num_points; number++){ // unsigned counter: same type as num_points
++         *cPtr++ = (*aPtr++) * (*bPtr++); // the int product is truncated to 8 bits by the store
++     }
++ }
++ #endif /* LV_HAVE_GENERIC */
++
++ #ifdef LV_HAVE_ORC
++ /*!
++ \brief ORC entry point for the unsigned char multiply: forwards all arguments unchanged to volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl, which is only declared here
++ \param cVector The unsigned char vector where the results will be stored
++ \param aVector One of the unsigned char vectors to be multiplied
++ \param bVector One of the unsigned char vectors to be multiplied
++ \param num_points The number of unsigned char values in aVector and bVector to be multiplied together and stored into cVector
++ */
++ extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
++ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
++ volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
++ }
++ #endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 2014-10-17 01:53:55.000000000 +0200
+@@ -0,0 +1,866 @@
++/*!
++ * \file volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
++ * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ *
++ * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2007 Julien Pommier
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
++ *
++ * Permission is granted to anyone to use this software for any purpose,
++ * including commercial applications, and to alter it and redistribute it
++ * freely, subject to the following restrictions:
++ *
++ * 1. The origin of this software must not be misrepresented; you must not
++ * claim that you wrote the original software. If you use this software
++ * in a product, an acknowledgment in the product documentation would be
++ * appreciated but is not required.
++ * 2. Altered source versions must be plainly marked as such, and must not be
++ * misrepresented as being the original software.
++ * 3. This notice may not be removed or altered from any source distribution.
++ *
++ *(this is the zlib license)
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2012 Giovanni Garberoglio
++ * Interdisciplinary Laboratory for Computational Science (LISC)
++ * Fondazione Bruno Kessler and University of Trento
++ * via Sommarive, 18
++ * I-38123 Trento (Italy)
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++ #ifdef LV_HAVE_AVX
++ #include <immintrin.h>
++ /*!
++  \brief Generates the local carrier replica cos(phase) - j*sin(phase), advancing the phase by phase_step_rad on every sample (unaligned AVX version)
++  \param d_carr_sign Output buffer that receives num_points complex carrier samples
++  \param phase_rad_init Phase, in radians, of the first output sample
++  \param phase_step_rad Phase increment, in radians, between consecutive output samples
++  \param num_points The number of carrier samples to generate
++  */
++ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++     // Eight phases are evaluated per loop iteration with the Cephes-style single-precision
++     // sin/cos polynomials credited in the file header. The integer steps of the algorithm
++     // run on two 128-bit halves, and the last num_points % 8 samples are produced with
++     // scalar cos()/sin() calls after the loop.
++
++     const unsigned int sse_iters = num_points / 8;
++
++     __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
++     __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
++     __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
++     __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
++     __m128i _pi32avx_1 = _mm_set1_epi32(1);
++     __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
++     __m128i _pi32avx_2 = _mm_set1_epi32(2);
++     __m128i _pi32avx_4 = _mm_set1_epi32(4);
++     __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
++     __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
++     __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
++     __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
++     __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
++     __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
++     __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
++     __m256 _ps256_1 = _mm256_set1_ps(1.f);
++     __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
++
++     __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); // each lane advances 8 samples per iteration
++
++     __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
++     __m256 xmm1, xmm2, xmm3, sign_bit_sin;
++     __m256i imm0, imm2, imm4;
++     __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
++     __VOLK_ATTR_ALIGNED(32) float sin_value[8];
++     __VOLK_ATTR_ALIGNED(32) float cos_value[8];
++
++     phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
++
++     for(unsigned int iter = 0; iter < sse_iters; iter++)
++         {
++
++             x = phase_rad_array;
++
++             /* extract the sign bit (upper one) */
++             sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
++
++             /* take the absolute value */
++             x = _mm256_xor_ps(x, sign_bit_sin);
++
++             /* scale by 4/Pi */
++             y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
++
++             /* we use SSE2 routines to perform the integer ops */
++
++             // truncate to integers in an integer register, then split into the two 128-bit halves
++             imm2 = _mm256_cvttps_epi32(y);
++             imm2_1 = _mm256_extractf128_si256 (imm2, 0);
++             imm2_2 = _mm256_extractf128_si256 (imm2, 1);
++
++             imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
++             imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
++
++             imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
++             imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
++
++             // rebuild the 256-bit integer vector from its two halves;
++             // _mm256_set_m128i is not defined in some versions of immintrin.h,
++             // so the equivalent cast + insert pair is used instead
++             imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
++
++             y = _mm256_cvtepi32_ps(imm2);
++
++             imm4_1 = imm2_1;
++             imm4_2 = imm2_2;
++
++             imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
++             imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
++
++             imm0_1 = _mm_slli_epi32(imm0_1, 29);
++             imm0_2 = _mm_slli_epi32(imm0_2, 29);
++
++             // imm0: bit 2 of the integer moved to the float sign-bit position (4 << 29);
++             // _mm256_set_m128i is not defined in some versions of immintrin.h,
++             // so the equivalent cast + insert pair is used instead
++             imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
++
++             imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
++             imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
++
++             imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
++             imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
++
++             // imm2: all-ones in the lanes where bit 1 of the integer is clear;
++             // _mm256_set_m128i is not defined in some versions of immintrin.h,
++             // so the equivalent cast + insert pair is used instead
++             imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
++
++             swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
++             poly_mask = _mm256_castsi256_ps(imm2);
++
++             /* The magic pass: "Extended precision modular arithmetic"
++              x = ((x - y * DP1) - y * DP2) - y * DP3; */
++             xmm1 = _ps256_minus_cephes_DP1;
++             xmm2 = _ps256_minus_cephes_DP2;
++             xmm3 = _ps256_minus_cephes_DP3;
++             xmm1 = _mm256_mul_ps(y, xmm1);
++             xmm2 = _mm256_mul_ps(y, xmm2);
++             xmm3 = _mm256_mul_ps(y, xmm3);
++             x = _mm256_add_ps(x, xmm1);
++             x = _mm256_add_ps(x, xmm2);
++             x = _mm256_add_ps(x, xmm3);
++
++             imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
++             imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
++
++             imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
++             imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
++
++             imm4_1 = _mm_slli_epi32(imm4_1, 29);
++             imm4_2 = _mm_slli_epi32(imm4_2, 29);
++
++             // imm4: sign-bit pattern that is applied to the cosine result below;
++             // _mm256_set_m128i is not defined in some versions of immintrin.h,
++             // so the equivalent cast + insert pair is used instead
++             imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
++
++             sign_bit_cos = _mm256_castsi256_ps(imm4);
++
++             sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
++
++             /* Evaluate the first polynomial (cosine coefficients) on the reduced argument */
++             z = _mm256_mul_ps(x,x);
++             y = _ps256_coscof_p0;
++
++             y = _mm256_mul_ps(y, z);
++             y = _mm256_add_ps(y, _ps256_coscof_p1);
++             y = _mm256_mul_ps(y, z);
++             y = _mm256_add_ps(y, _ps256_coscof_p2);
++             y = _mm256_mul_ps(y, z);
++             y = _mm256_mul_ps(y, z);
++             tmp = _mm256_mul_ps(z, _ps256_0p5);
++             y = _mm256_sub_ps(y, tmp);
++             y = _mm256_add_ps(y, _ps256_1);
++
++             /* Evaluate the second polynomial (sine coefficients) on the reduced argument */
++
++             y2 = _ps256_sincof_p0;
++             y2 = _mm256_mul_ps(y2, z);
++             y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
++             y2 = _mm256_mul_ps(y2, z);
++             y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
++             y2 = _mm256_mul_ps(y2, z);
++             y2 = _mm256_mul_ps(y2, x);
++             y2 = _mm256_add_ps(y2, x);
++
++             /* select the correct result from the two polynomials */
++             xmm3 = poly_mask;
++             ysin2 = _mm256_and_ps(xmm3, y2);
++             ysin1 = _mm256_andnot_ps(xmm3, y);
++             y2 = _mm256_sub_ps(y2,ysin2);
++             y = _mm256_sub_ps(y, ysin1);
++
++             xmm1 = _mm256_add_ps(ysin1,ysin2);
++             xmm2 = _mm256_add_ps(y,y2);
++
++             /* update the sign */
++             s = _mm256_xor_ps(xmm1, sign_bit_sin);
++             c = _mm256_xor_ps(xmm2, sign_bit_cos);
++
++             //GNSS-SDR needs to return -sin
++             s = _mm256_xor_ps(s, _ps256_sign_mask);
++
++             _mm256_storeu_ps ((float*)sin_value, s);
++             _mm256_storeu_ps ((float*)cos_value, c);
++
++             for(unsigned int k = 0; k < 8; k++) // distinct index name: no shadowing of the outer counter
++                 {
++                     d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
++                 }
++             d_carr_sign += 8;
++
++             phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
++         }
++
++     if (num_points%8!=0)
++         {
++             __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
++             _mm256_storeu_ps (phase_rad_store, phase_rad_array); // float store for a float vector
++
++             float phase_rad = phase_rad_store[0]; // lane 0 holds the phase of the next sample to write
++
++             for(unsigned int i = 0; i < num_points%8; i++)
++                 {
++                     *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++                     d_carr_sign++;
++                     phase_rad += phase_step_rad;
++                 }
++         }
++ }
++ #endif /* LV_HAVE_AVX */
++
++
++ #ifdef LV_HAVE_SSE2
++ #include <emmintrin.h>
++ /*!
++  \brief Generates the local carrier replica cos(phase) - j*sin(phase), advancing the phase by phase_step_rad on every sample (unaligned SSE2 version)
++  \param d_carr_sign Output buffer that receives num_points complex carrier samples
++  \param phase_rad_init Phase, in radians, of the first output sample
++  \param phase_step_rad Phase increment, in radians, between consecutive output samples
++  \param num_points The number of carrier samples to generate
++ */
++ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++     // Four phases are evaluated per loop iteration with the Cephes-style single-precision
++     // sin/cos polynomials credited in the file header. The last num_points % 4 samples
++     // are produced with scalar cos()/sin() calls after the loop, starting from the phase
++     // left in lane 0 of the phase vector.
++
++     const unsigned int sse_iters = num_points / 4;
++
++     __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
++     __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
++     __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
++     __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
++     __m128i _pi32_1 = _mm_set1_epi32(1);
++     __m128i _pi32_inv1 = _mm_set1_epi32(~1);
++     __m128i _pi32_2 = _mm_set1_epi32(2);
++     __m128i _pi32_4 = _mm_set1_epi32(4);
++     __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
++     __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
++     __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
++     __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
++     __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
++     __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
++     __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
++     __m128 _ps_1 = _mm_set1_ps(1.f);
++     __m128 _ps_0p5 = _mm_set1_ps(0.5f);
++
++     __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); // each lane advances 4 samples per iteration
++
++     __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
++     __m128 xmm1, xmm2, xmm3, sign_bit_sin;
++     __m128i emm0, emm2, emm4;
++     __VOLK_ATTR_ALIGNED(16) float sin_value[4];
++     __VOLK_ATTR_ALIGNED(16) float cos_value[4];
++
++     phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
++
++     for(unsigned int iter = 0; iter < sse_iters; iter++)
++         {
++             x = phase_rad_array;
++
++             /* extract the sign bit (upper one) */
++             sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
++
++             /* take the absolute value */
++             x = _mm_xor_ps(x, sign_bit_sin);
++
++             /* scale by 4/Pi */
++             y = _mm_mul_ps(x, _ps_cephes_FOPI);
++
++             /* store the integer part of y in emm2 */
++             emm2 = _mm_cvttps_epi32(y);
++
++             /* j=(j+1) & (~1) (see the cephes sources) */
++             emm2 = _mm_add_epi32(emm2, _pi32_1);
++             emm2 = _mm_and_si128(emm2, _pi32_inv1);
++             y = _mm_cvtepi32_ps(emm2);
++
++             emm4 = emm2;
++
++             /* get the swap sign flag for the sine */
++             emm0 = _mm_and_si128(emm2, _pi32_4);
++             emm0 = _mm_slli_epi32(emm0, 29);
++             swap_sign_bit_sin = _mm_castsi128_ps(emm0);
++
++             /* get the polynomial selection mask for the sine */
++             emm2 = _mm_and_si128(emm2, _pi32_2);
++             emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
++             poly_mask = _mm_castsi128_ps(emm2);
++
++             /* The magic pass: "Extended precision modular arithmetic"
++              x = ((x - y * DP1) - y * DP2) - y * DP3; */
++             xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
++             xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
++             xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
++             x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
++
++             emm4 = _mm_sub_epi32(emm4, _pi32_2);
++             emm4 = _mm_andnot_si128(emm4, _pi32_4);
++             emm4 = _mm_slli_epi32(emm4, 29);
++             sign_bit_cos = _mm_castsi128_ps(emm4);
++
++             sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
++
++             /* Evaluate the first polynomial (cosine coefficients) on the reduced argument */
++             z = _mm_mul_ps(x,x);
++             y = _ps_coscof_p0;
++             y = _mm_mul_ps(y, z);
++             y = _mm_add_ps(y, _ps_coscof_p1);
++             y = _mm_mul_ps(y, z);
++             y = _mm_add_ps(y, _ps_coscof_p2);
++             y = _mm_mul_ps(y, _mm_mul_ps(z, z));
++             tmp = _mm_mul_ps(z, _ps_0p5);
++             y = _mm_sub_ps(y, tmp);
++             y = _mm_add_ps(y, _ps_1);
++
++             /* Evaluate the second polynomial (sine coefficients) on the reduced argument */
++             y2 = _ps_sincof_p0;
++             y2 = _mm_mul_ps(y2, z);
++             y2 = _mm_add_ps(y2, _ps_sincof_p1);
++             y2 = _mm_mul_ps(y2, z);
++             y2 = _mm_add_ps(y2, _ps_sincof_p2);
++             y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
++             y2 = _mm_add_ps(y2, x);
++
++             /* select the correct result from the two polynomials */
++             xmm3 = poly_mask;
++             ysin2 = _mm_and_ps(xmm3, y2);
++             ysin1 = _mm_andnot_ps(xmm3, y);
++             y2 = _mm_sub_ps(y2,ysin2);
++             y = _mm_sub_ps(y, ysin1);
++
++             xmm1 = _mm_add_ps(ysin1,ysin2);
++             xmm2 = _mm_add_ps(y,y2);
++
++             /* update the sign */
++             s = _mm_xor_ps(xmm1, sign_bit_sin);
++             c = _mm_xor_ps(xmm2, sign_bit_cos);
++
++             //GNSS-SDR needs to return -sin
++             s = _mm_xor_ps(s, _ps_sign_mask);
++
++             _mm_storeu_ps ((float*)sin_value, s);
++             _mm_storeu_ps ((float*)cos_value, c);
++
++             for(unsigned int k = 0; k < 4; k++) // distinct index name: no shadowing of the outer counter
++                 {
++                     d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
++                 }
++             d_carr_sign += 4;
++
++             phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
++         }
++
++     if (num_points%4!=0)
++         {
++             __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
++             _mm_storeu_ps (phase_rad_store, phase_rad_array); // float store for a float vector
++
++             float phase_rad = phase_rad_store[0]; // lane 0 holds the phase of the next sample to write
++
++             for(unsigned int i = 0; i < num_points%4; i++)
++                 {
++                     *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++                     d_carr_sign++;
++                     phase_rad += phase_step_rad;
++                 }
++         }
++ }
++ #endif /* LV_HAVE_SSE2 */
++
++ #ifdef LV_HAVE_GENERIC
++ /*!
++  \brief Generates the local carrier replica cos(phase) - j*sin(phase), advancing the phase by phase_step_rad on every sample (portable version)
++  \param d_carr_sign Output buffer that receives num_points complex carrier samples
++  \param phase_rad_init Phase, in radians, of the first output sample
++  \param phase_step_rad Phase increment, in radians, between consecutive output samples
++  \param num_points The number of carrier samples to generate
++ */
++ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++     // Reference implementation: one scalar cos()/sin() evaluation per output sample.
++     // The running phase is kept in a single-precision float and incremented once per
++     // sample. The imaginary part is negated (-sin), as in the SIMD versions of this
++     // kernel.
++
++     float phase_rad = phase_rad_init;
++     for(unsigned int i = 0; i < num_points; i++) // unsigned counter: same type as num_points
++         {
++             *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++             d_carr_sign++;
++             phase_rad += phase_step_rad;
++         }
++ }
++ #endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_AVX
++#include <tmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++
++ // float* pointer1 = (float*)&phase_rad_init;
++ // *pointer1 = 0;
++ // float* pointer2 = (float*)&phase_step_rad;
++ // *pointer2 = 0.5;
++
++ const unsigned int sse_iters = num_points / 8;
++
++ __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
++ __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
++ __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
++ __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
++ __m128i _pi32avx_1 = _mm_set1_epi32(1);
++ __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
++ __m128i _pi32avx_2 = _mm_set1_epi32(2);
++ __m128i _pi32avx_4 = _mm_set1_epi32(4);
++ __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
++ __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
++ __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
++ __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
++ __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
++ __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
++ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
++ __m256 _ps256_1 = _mm256_set1_ps(1.f);
++ __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
++
++ __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad);
++
++ __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
++ __m256 xmm1, xmm2, xmm3, sign_bit_sin;
++ __m256i imm0, imm2, imm4;
++ __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
++ __VOLK_ATTR_ALIGNED(32) float sin_value[8];
++ __VOLK_ATTR_ALIGNED(32) float cos_value[8];
++
++ phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
++
++ for(int i = 0; i < sse_iters; i++)
++ {
++
++ x = phase_rad_array;
++
++ /* extract the sign bit (upper one) */
++ sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
++
++ /* take the absolute value */
++ x = _mm256_xor_ps(x, sign_bit_sin);
++
++ /* scale by 4/Pi */
++ y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
++
++ /* we use SSE2 routines to perform the integer ops */
++
++ //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
++ y = _mm256_cvttps_epi32(y);
++ imm2_1 = _mm256_extractf128_ps (y, 0);
++ imm2_2 = _mm256_extractf128_ps (y, 1);
++
++ imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
++ imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
++
++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
++
++ //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
++ //_mm256_set_m128i not defined in some versions of immintrin.h
++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
++
++ y = _mm256_cvtepi32_ps(imm2);
++
++ imm4_1 = imm2_1;
++ imm4_2 = imm2_2;
++
++ imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
++ imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
++
++ imm0_1 = _mm_slli_epi32(imm0_1, 29);
++ imm0_2 = _mm_slli_epi32(imm0_2, 29);
++
++ //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
++ //_mm256_set_m128i not defined in some versions of immintrin.h
++ //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
++ imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
++
++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
++
++ imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
++ imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
++
++ //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
++ //_mm256_set_m128i not defined in some versions of immintrin.h
++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
++
++ swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
++ poly_mask = _mm256_castsi256_ps(imm2);
++
++ /* The magic pass: "Extended precision modular arithmetic"
++ x = ((x - y * DP1) - y * DP2) - y * DP3; */
++ xmm1 = _ps256_minus_cephes_DP1;
++ xmm2 = _ps256_minus_cephes_DP2;
++ xmm3 = _ps256_minus_cephes_DP3;
++ xmm1 = _mm256_mul_ps(y, xmm1);
++ xmm2 = _mm256_mul_ps(y, xmm2);
++ xmm3 = _mm256_mul_ps(y, xmm3);
++ x = _mm256_add_ps(x, xmm1);
++ x = _mm256_add_ps(x, xmm2);
++ x = _mm256_add_ps(x, xmm3);
++
++ imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
++ imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
++
++ imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
++ imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
++
++ imm4_1 = _mm_slli_epi32(imm4_1, 29);
++ imm4_2 = _mm_slli_epi32(imm4_2, 29);
++
++ //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
++ //_mm256_set_m128i not defined in some versions of immintrin.h
++ //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
++ imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
++
++ sign_bit_cos = _mm256_castsi256_ps(imm4);
++
++ sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
++
++ /* Evaluate the first polynom (0 <= x <= Pi/4) */
++ z = _mm256_mul_ps(x,x);
++ y = _ps256_coscof_p0;
++
++ y = _mm256_mul_ps(y, z);
++ y = _mm256_add_ps(y, _ps256_coscof_p1);
++ y = _mm256_mul_ps(y, z);
++ y = _mm256_add_ps(y, _ps256_coscof_p2);
++ y = _mm256_mul_ps(y, z);
++ y = _mm256_mul_ps(y, z);
++ tmp = _mm256_mul_ps(z, _ps256_0p5);
++ y = _mm256_sub_ps(y, tmp);
++ y = _mm256_add_ps(y, _ps256_1);
++
++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
++
++ y2 = _ps256_sincof_p0;
++ y2 = _mm256_mul_ps(y2, z);
++ y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
++ y2 = _mm256_mul_ps(y2, z);
++ y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
++ y2 = _mm256_mul_ps(y2, z);
++ y2 = _mm256_mul_ps(y2, x);
++ y2 = _mm256_add_ps(y2, x);
++
++ /* select the correct result from the two polynoms */
++ xmm3 = poly_mask;
++ ysin2 = _mm256_and_ps(xmm3, y2);
++ ysin1 = _mm256_andnot_ps(xmm3, y);
++ y2 = _mm256_sub_ps(y2,ysin2);
++ y = _mm256_sub_ps(y, ysin1);
++
++ xmm1 = _mm256_add_ps(ysin1,ysin2);
++ xmm2 = _mm256_add_ps(y,y2);
++
++ /* update the sign */
++ s = _mm256_xor_ps(xmm1, sign_bit_sin);
++ c = _mm256_xor_ps(xmm2, sign_bit_cos);
++
++ //GNSS-SDR needs to return -sin
++ s = _mm256_xor_ps(s, _ps256_sign_mask);
++
++ _mm256_store_ps ((float*)sin_value, s);
++ _mm256_store_ps ((float*)cos_value, c);
++
++ for(int i = 0; i < 8; i++)
++ {
++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]);
++ }
++ d_carr_sign += 8;
++
++ phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
++ }
++
++ if (num_points%8!=0)
++ {
++ __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
++ _mm256_store_ps ((float*)phase_rad_store, phase_rad_array);
++
++ float phase_rad = phase_rad_store[0];
++
++ for(int i = 0; i < num_points%8; i++)
++ {
++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++ d_carr_sign++;
++ phase_rad += phase_step_rad;
++ }
++ }
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Generates a local carrier replica, cos(phase) - j*sin(phase), four samples per iteration using SSE2
++ \param d_carr_sign Output buffer that receives num_points complex carrier samples
++ \param phase_rad_init Phase, in radians, of the first output sample
++ \param phase_step_rad Phase increment, in radians, between consecutive output samples
++ \param num_points The number of complex samples to write to d_carr_sign
++ \note Sine and cosine of the vectorized samples come from the polynomial approximation set up below;
++       the trailing (num_points % 4) samples are computed with cos()/sin() from the C math library.
++ \note The phase is accumulated in single precision across iterations.
++ */
++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++
++    /* number of complete 4-sample groups handled by the vector loop */
++    const unsigned int sse_iters = num_points / 4;
++
++    __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
++    __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
++    __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
++    __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
++    __m128i _pi32_1 = _mm_set1_epi32(1);
++    __m128i _pi32_inv1 = _mm_set1_epi32(~1);
++    __m128i _pi32_2 = _mm_set1_epi32(2);
++    __m128i _pi32_4 = _mm_set1_epi32(4);
++    __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
++    __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
++    __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
++    __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
++    __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
++    __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
++    __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
++    __m128 _ps_1 = _mm_set1_ps(1.f);
++    __m128 _ps_0p5 = _mm_set1_ps(0.5f);
++
++    __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
++
++    __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
++    __m128 xmm1, xmm2, xmm3, sign_bit_sin;
++    __m128i emm0, emm2, emm4;
++    __VOLK_ATTR_ALIGNED(16) float sin_value[4];
++    __VOLK_ATTR_ALIGNED(16) float cos_value[4];
++
++    phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
++
++    for(unsigned int i = 0; i < sse_iters; i++)
++    {
++        x = phase_rad_array;
++
++        /* extract the sign bit (upper one) */
++        sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
++
++        /* take the absolute value */
++        x = _mm_xor_ps(x, sign_bit_sin);
++
++        /* scale by 4/Pi */
++        y = _mm_mul_ps(x, _ps_cephes_FOPI);
++
++        /* store the integer part of y in emm2 */
++        emm2 = _mm_cvttps_epi32(y);
++
++        /* j=(j+1) & (~1) (see the cephes sources) */
++        emm2 = _mm_add_epi32(emm2, _pi32_1);
++        emm2 = _mm_and_si128(emm2, _pi32_inv1);
++        y = _mm_cvtepi32_ps(emm2);
++
++        emm4 = emm2;
++
++        /* get the swap sign flag for the sine */
++        emm0 = _mm_and_si128(emm2, _pi32_4);
++        emm0 = _mm_slli_epi32(emm0, 29);
++        swap_sign_bit_sin = _mm_castsi128_ps(emm0);
++
++        /* get the polynomial selection mask for the sine */
++        emm2 = _mm_and_si128(emm2, _pi32_2);
++        emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
++        poly_mask = _mm_castsi128_ps(emm2);
++
++        /* The magic pass: "Extended precision modular arithmetic"
++           x = ((x - y * DP1) - y * DP2) - y * DP3; */
++        xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
++        xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
++        xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
++        x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
++
++        emm4 = _mm_sub_epi32(emm4, _pi32_2);
++        emm4 = _mm_andnot_si128(emm4, _pi32_4);
++        emm4 = _mm_slli_epi32(emm4, 29);
++        sign_bit_cos = _mm_castsi128_ps(emm4);
++
++        sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
++
++        /* Evaluate the first polynomial (cosine coefficients) on the reduced argument */
++        z = _mm_mul_ps(x,x);
++        y = _ps_coscof_p0;
++        y = _mm_mul_ps(y, z);
++        y = _mm_add_ps(y, _ps_coscof_p1);
++        y = _mm_mul_ps(y, z);
++        y = _mm_add_ps(y, _ps_coscof_p2);
++        y = _mm_mul_ps(y, _mm_mul_ps(z, z));
++        tmp = _mm_mul_ps(z, _ps_0p5);
++        y = _mm_sub_ps(y, tmp);
++        y = _mm_add_ps(y, _ps_1);
++
++        /* Evaluate the second polynomial (sine coefficients) on the same reduced argument */
++        y2 = _ps_sincof_p0;
++        y2 = _mm_mul_ps(y2, z);
++        y2 = _mm_add_ps(y2, _ps_sincof_p1);
++        y2 = _mm_mul_ps(y2, z);
++        y2 = _mm_add_ps(y2, _ps_sincof_p2);
++        y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
++        y2 = _mm_add_ps(y2, x);
++
++        /* select the correct result from the two polynomials */
++        xmm3 = poly_mask;
++        ysin2 = _mm_and_ps(xmm3, y2);
++        ysin1 = _mm_andnot_ps(xmm3, y);
++        y2 = _mm_sub_ps(y2,ysin2);
++        y = _mm_sub_ps(y, ysin1);
++
++        xmm1 = _mm_add_ps(ysin1,ysin2);
++        xmm2 = _mm_add_ps(y,y2);
++
++        /* update the sign */
++        s = _mm_xor_ps(xmm1, sign_bit_sin);
++        c = _mm_xor_ps(xmm2, sign_bit_cos);
++
++        //GNSS-SDR needs to return -sin
++        s = _mm_xor_ps(s, _ps_sign_mask);
++
++        _mm_store_ps ((float*)sin_value, s);
++        _mm_store_ps ((float*)cos_value, c);
++
++        for(unsigned int k = 0; k < 4; k++) /* distinct index: do not shadow the outer loop counter */
++        {
++            d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
++        }
++        d_carr_sign += 4;
++
++        phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
++    }
++
++    if (num_points%4!=0)
++    {
++        __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
++        _mm_store_ps ((float*)phase_rad_store, phase_rad_array); /* float store: phase_rad_array is __m128, not __m128i */
++
++        float phase_rad = phase_rad_store[0];
++
++        for(unsigned int i = 0; i < num_points%4; i++)
++        {
++            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++            d_carr_sign++;
++            phase_rad += phase_step_rad;
++        }
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Generates a local carrier replica, cos(phase) - j*sin(phase), one sample at a time
++ \param d_carr_sign Output buffer that receives num_points complex carrier samples
++ \param phase_rad_init Phase, in radians, of the first output sample
++ \param phase_step_rad Phase increment, in radians, between consecutive output samples
++ \param num_points The number of complex samples to write to d_carr_sign
++ \note The phase is accumulated in a single-precision float; cos()/sin() are the C math library calls.
++ \note Writes nothing when num_points is 0.
++ */
++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++
++    /* phase of the sample currently being generated */
++    float phase_rad = phase_rad_init;
++    for(unsigned int i = 0; i < num_points; i++)
++    {
++        /* the imaginary part carries the negated sine */
++        *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++        d_carr_sign++;
++        phase_rad += phase_step_rad;
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */
++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt
+--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 04:26:38.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:17:37.000000000 +0200
+@@ -517,7 +517,19 @@ if(MSVC)
+ endif()
+
+ #create the volk_gnsssdr runtime library
+-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
++
++#MODIFICATIONS BY GNSS-SDR
++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)
++
++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})
++
++source_group("Kernels" FILES ${h_files})
++source_group("Common Macros" FILES ${CommonMacros})
++source_group("ORC Files" FILES ${orc})
++#END OF MODIFICATIONS
++
+ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
+ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
+ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc
+--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200
+@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn
+ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+ }
+
++//ADDED BY GNSS-SDR. START
++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++
++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
++}
++//ADDED BY GNSS-SDR. END
++
+ // This function is a nop that helps resolve GNU Radio bugs 582 and 583.
+ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol)
+ // won't happen on armhf (reported on cortex A9 and A15).
+@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
+ } else {
+ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+- } else throw "unsupported 1 arg function >1 scalars";
++ }
++ //ADDED BY GNSS-SDR. START
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ //ADDED BY GNSS-SDR. END
++ else throw "unsupported 1 arg function >1 scalars";
+ break;
+ case 2:
+ if(inputsc.size() == 0) {
+@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
+ } else {
+ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+- } else throw "unsupported 2 arg function >1 scalars";
++ }
++ //ADDED BY GNSS-SDR. START
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ //ADDED BY GNSS-SDR. END
++ else throw "unsupported 2 arg function >1 scalars";
+ break;
+ case 3:
+ if(inputsc.size() == 0) {
+@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
+ } else {
+ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+ }
+- } else throw "unsupported 3 arg function >1 scalars";
++ }
++ //ADDED BY GNSS-SDR. START
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ //ADDED BY GNSS-SDR. END
++ else throw "unsupported 3 arg function >1 scalars";
+ break;
+ case 4:
+ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ break;
++ //ADDED BY GNSS-SDR. START
++ case 8:
++ if(inputsc.size() == 0) {
++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else throw "unsupported 8 arg function >1 scalars";
++ break;
++ case 12:
++ if(inputsc.size() == 0) {
++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
++ if(inputsc[0].is_complex) {
++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
++ } else {
++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++ }
++ else throw "unsupported 12 arg function >1 scalars";
++ break;
++ //ADDED BY GNSS-SDR. END
+ default:
+ throw "no function handler for this signature";
+ break;
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h
+--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200
+@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f
+ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
+ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+
++//ADDED BY GNSS-SDR. START
++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
++
++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
++
++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
++//ADDED BY GNSS-SDR. END
++
++
+ #endif //VOLK_QA_UTILS_H
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc
+--- /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc 2014-10-15 01:55:08.000000000 +0200
+@@ -24,6 +24,58 @@
+ #include <volk_gnsssdr/volk_gnsssdr.h>
+ #include <boost/test/unit_test.hpp>
+
++//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
++
++//GNSS-SDR PROTO-KERNELS
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1);
++
++VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
++
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
++
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
++
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1);
++
++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1);
++VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1);
++
++
++
++
++
++
++
+ //VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
+ //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000);
+ //VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000);
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,5 @@
++.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl
++.dest 4 dst
++.source 4 src1
++.source 4 src2
++addf dst, src1, src2
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,18 @@
++.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl
++.source 8 src1
++.floatparam 8 scalar
++.dest 8 dst
++.temp 8 iqprod
++.temp 4 real
++.temp 4 imag
++.temp 4 ac
++.temp 4 bd
++.temp 8 swapped
++x2 mulf iqprod, src1, scalar
++splitql bd, ac, iqprod
++subf real, ac, bd
++swaplq swapped, src1
++x2 mulf iqprod, swapped, scalar
++splitql bd, ac, iqprod
++addf imag, ac, bd
++mergelq dst, real, imag
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,18 @@
++.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl
++.source 8 src1
++.source 8 src2
++.dest 8 dst
++.temp 8 iqprod
++.temp 4 real
++.temp 4 imag
++.temp 4 ac
++.temp 4 bd
++.temp 8 swapped
++x2 mulf iqprod, src1, src2
++splitql bd, ac, iqprod
++subf real, ac, bd
++swaplq swapped, src1
++x2 mulf iqprod, swapped, src2
++splitql bd, ac, iqprod
++addf imag, ac, bd
++mergelq dst, real, imag
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,40 @@
++#/*!
++# * \file volk_gnsssdr_8i_accumulator_s8i.orc
++# * \brief ORC implementation: 8 bits (char) scalar accumulator
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that implements an accumulator of char values
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl
++.source 1 src1
++.accumulator 2 acc
++.temp 2 sum
++mergebw sum, 0, src1
++accw acc, sum
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,39 @@
++#/*!
++# * \file volk_gnsssdr_8i_x2_add_8i.orc
++# * \brief ORC implementation: adds pairs of 8 bits (char) scalars
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that adds pairs of 8 bits (char) scalars
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl
++.dest 1 dst
++.source 1 src1
++.source 1 src2
++addb dst, src1, src2
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,42 @@
++#/*!
++# * \file volk_gnsssdr_8ic_conjugate_8ic.orc
++# * \brief ORC implementation: calculates the conjugate of a 16 bits vector
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that calculates the conjugate of a
++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
++# * result = real - j*imag
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl
++.source 2 src1
++.dest 2 dst
++.temp 2 merged
++mergebw merged, 1, -1
++x2 mullb dst, merged, src1
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,45 @@
++#/*!
++# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc
++# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that calculates the magnitude squared of a
++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
++# * result = (real*real) + (imag*imag)
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl
++.source 2 src1
++.dest 1 dst
++.temp 2 iqprod
++.temp 1 ac
++.temp 1 bd
++x2 mullb iqprod, src1, src1
++splitwb bd, ac, iqprod
++addb dst, ac, bd
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,58 @@
++#/*!
++# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc
++# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that multiplies a group of 16 bits vectors
++# * (8 bits the real part and 8 bits the imaginary part) by one constant vector
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl
++.source 2 src1
++.param 2 src2real
++.param 2 src2imag
++.dest 2 dst
++.temp 2 iqprod
++.temp 1 real
++.temp 1 imag
++.temp 1 rr
++.temp 1 ii
++.temp 1 ri
++.temp 1 ir
++x2 mullb iqprod, src1, src2real
++splitwb ir, rr, iqprod
++x2 mullb iqprod, src1, src2imag
++splitwb ii, ri, iqprod
++subb real, rr, ii
++addb imag, ri, ir
++mergebw dst, real, imag
++
++
++
++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,59 @@
++#/*!
++# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc
++# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that multiplies two 16 bits vectors (8 bits the real part
++# * and 8 bits the imaginary part) and accumulates them
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl
++.source 2 src1
++.source 2 src2
++.accumulator 2 accreal
++.accumulator 2 accimag
++.temp 2 iqprod
++.temp 1 real
++.temp 1 imag
++.temp 2 real2
++.temp 2 imag2
++.temp 1 ac
++.temp 1 bd
++.temp 2 swapped
++x2 mullb iqprod, src1, src2
++splitwb bd, ac, iqprod
++subb real, ac, bd
++swapw swapped, src1
++x2 mullb iqprod, swapped, src2
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw real2, 0, real
++accw accreal, real2
++mergebw imag2, 0, imag
++accw accimag, imag2
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,57 @@
++#/*!
++# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc
++# * \brief ORC implementation: multiplies two 16 bits vectors
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that multiplies two 16 bits vectors (8 bits the real part
++# * and 8 bits the imaginary part)
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl
++.source 2 src1
++.source 2 src2
++.dest 2 dst
++.temp 2 iqprod
++.temp 1 real
++.temp 1 imag
++.temp 1 ac
++.temp 1 bd
++.temp 2 swapped
++x2 mullb iqprod, src1, src2
++splitwb bd, ac, iqprod
++subb real, ac, bd
++swapw swapped, src1
++x2 mullb iqprod, swapped, src2
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw dst, real, imag
++
++
++
++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,139 @@
++#/*!
++# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
++# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that performs the carrier wipe-off mixing and the
++# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
++# * real part and 8 bits the imaginary part):
++# * - The carrier wipe-off is done by multiplying the input signal by the
++# * carrier (multiplication of 16 bits vectors). It returns the input
++# * signal in base band (BB)
++# * - Early values are calculated by multiplying the input signal in BB by the
++# * early code (multiplication of 16 bits vectors), accumulating the results
++# * - Prompt values are calculated by multiplying the input signal in BB by the
++# * prompt code (multiplication of 16 bits vectors), accumulating the results
++# * - Late values are calculated by multiplying the input signal in BB by the
++# * late code (multiplication of 16 bits vectors), accumulating the results
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl
++.source 2 input
++.source 2 carrier
++.source 2 E_code
++.source 2 P_code
++.accumulator 2 E_out_real
++.accumulator 2 E_out_imag
++.accumulator 2 P_out_real
++.accumulator 2 P_out_imag
++.temp 2 bb_signal_sample
++.temp 2 iqprod
++.temp 1 real
++.temp 1 imag
++.temp 1 ac
++.temp 1 bd
++.temp 2 swapped
++
++.temp 2 real2
++.temp 2 imag2
++
++x2 mullb iqprod, input, carrier
++splitwb bd, ac, iqprod
++subb real, ac, bd
++swapw swapped, input
++x2 mullb iqprod, swapped, carrier
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw bb_signal_sample, real, imag
++
++swapw swapped, bb_signal_sample
++
++x2 mullb iqprod, bb_signal_sample, E_code
++splitwb bd, ac, iqprod
++subb real, ac, bd
++x2 mullb iqprod, swapped, E_code
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw real2, 0, real
++mergebw imag2, 0, imag
++accw E_out_real, real2
++accw E_out_imag, imag2
++
++x2 mullb iqprod, bb_signal_sample, P_code
++splitwb bd, ac, iqprod
++subb real, ac, bd
++x2 mullb iqprod, swapped, P_code
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw real2, 0, real
++mergebw imag2, 0, imag
++accw P_out_real, real2
++accw P_out_imag, imag2
++
++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl
++.source 2 input
++.source 2 carrier
++.source 2 L_code
++.accumulator 2 L_out_real
++.accumulator 2 L_out_imag
++
++.temp 2 bb_signal_sample
++.temp 2 iqprod
++.temp 1 real
++.temp 1 imag
++.temp 1 ac
++.temp 1 bd
++.temp 2 swapped
++
++.temp 2 real2
++.temp 2 imag2
++
++x2 mullb iqprod, input, carrier
++splitwb bd, ac, iqprod
++subb real, ac, bd
++swapw swapped, input
++x2 mullb iqprod, swapped, carrier
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw bb_signal_sample, real, imag
++
++swapw swapped, bb_signal_sample
++
++x2 mullb iqprod, bb_signal_sample, L_code
++splitwb bd, ac, iqprod
++subb real, ac, bd
++x2 mullb iqprod, swapped, L_code
++splitwb bd, ac, iqprod
++addb imag, ac, bd
++mergebw real2, 0, real
++mergebw imag2, 0, imag
++accw L_out_real, real2
++accw L_out_imag, imag2
++
++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc
+--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,39 @@
++#/*!
++# * \file volk_gnsssdr_8u_x2_multiply_8u.orc
++# * \brief ORC implementation: multiplies unsigned char values
++# * \authors <ul>
++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++# * </ul>
++# *
++# * ORC code that multiplies unsigned char values (8 bits data)
++# *
++# * -------------------------------------------------------------------------
++# *
++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++# *
++# * GNSS-SDR is a software defined Global Navigation
++# * Satellite Systems receiver
++# *
++# * This file is part of GNSS-SDR.
++# *
++# * GNSS-SDR is free software: you can redistribute it and/or modify
++# * it under the terms of the GNU General Public License as published by
++# * the Free Software Foundation, either version 3 of the License, or
++# * (at your option) any later version.
++# *
++# * GNSS-SDR is distributed in the hope that it will be useful,
++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# * GNU General Public License for more details.
++# *
++# * You should have received a copy of the GNU General Public License
++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# *
++# * -------------------------------------------------------------------------
++# */
++
++.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl
++.source 1 src1
++.source 1 src2
++.dest 1 dst
++mullb dst, src1, src2
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch
+--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 2014-10-17 03:39:01.000000000 +0200
+@@ -0,0 +1,471 @@
++Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt
++--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-15 01:55:08.000000000 +0200
++@@ -406,8 +406,10 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8
++ # if we find one that matches our current system architecture
++ # set up the assembler flags and include the source files
++ foreach(ARCH ${ASM_ARCHS_AVAILABLE})
+++ message(STATUS "--==>> -CFLAGS1: ${FULL_C_FLAGS}")
++ string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}")
++ if( ASM_ARCH STREQUAL "armv7" )
+++ set(ASM-ATT $ENV{ASM})
++ message(STATUS "---- Adding ASM files") # we always use ATT syntax
++ message(STATUS "-- Detected armv7 architecture; enabling ASM")
++ # setup architecture specific assembler flags
++@@ -420,20 +422,13 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8
++ message(STATUS "Adding source file: ${asm_file}")
++ endforeach(asm_file)
++ endif()
++- enable_language(ASM)
++- set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS})
++- message(STATUS "c flags: ${FULL_C_FLAGS}")
++- message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
+++ set(CMAKE_ASM-ATT_FLAGS_INIT ${ARCH_ASM_FLAGS})
+++ enable_language(ASM-ATT) # this must be after flags_init
+++ message(STATUS "asm flags: ${CMAKE_ASM-ATT_FLAGS}")
++ endforeach(ARCH)
++
++ else(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
++ message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.")
++- foreach(machine_name ${available_machines})
++- string(REGEX MATCH "neon" NEON_MACHINE ${machine_name})
++- if( NEON_MACHINE STREQUAL "neon")
++- message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support")
++- endif()
++- endforeach()
++ endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
++
++ ########################################################################
++@@ -517,11 +512,24 @@ if(MSVC)
++ endif()
++
++ #create the volk_gnsssdr runtime library
++-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
+++
+++#MODIFICATIONS BY GNSS-SDR
+++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
+++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)
+++
+++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
+++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})
+++
+++source_group("Kernels" FILES ${h_files})
+++source_group("Common Macros" FILES ${CommonMacros})
+++source_group("ORC Files" FILES ${orc})
+++#END OF MODIFICATIONS
+++
++ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
++ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
++ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")
++
+++
++ install(TARGETS volk_gnsssdr
++ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_runtime" # .so file
++ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel" # .lib file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc
++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 01:54:35.000000000 +0200
++@@ -5,9 +5,7 @@
++ #include <boost/tokenizer.hpp>
++ #include <boost/xpressive/xpressive.hpp>
++ #include <iostream>
++-#include <fstream>
++ #include <vector>
++-#include <map>
++ #include <list>
++ #include <ctime>
++ #include <cmath>
++@@ -217,6 +215,72 @@ inline void run_cast_test3_s32fc(volk_gn
++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++ }
++
+++//ADDED BY GNSS-SDR. START
+++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++//ADDED BY GNSS-SDR. END
+++
++ // This function is a nop that helps resolve GNU Radio bugs 582 and 583.
++ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol)
++ // won't happen on armhf (reported on cortex A9 and A15).
++@@ -330,9 +394,9 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ lv_32fc_t scalar,
++ int vlen,
++ int iter,
++- std::vector<volk_gnsssdr_test_results_t> *results,
++- std::string puppet_master_name,
++- bool benchmark_mode,
+++ std::vector<std::string> *best_arch_vector = 0,
+++ std::string puppet_master_name = "NULL",
+++ bool benchmark_mode,
++ std::string kernel_regex
++ ) {
++ boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex);
++@@ -340,12 +404,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ // in this case we have a regex and are only looking to test one kernel
++ return false;
++ }
++- if(results) {
++- results->push_back(volk_gnsssdr_test_results_t());
++- results->back().name = name;
++- results->back().vlen = vlen;
++- results->back().iter = iter;
++- }
++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
++
++ // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583
++@@ -426,7 +484,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ } else {
++ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++- } else throw "unsupported 1 arg function >1 scalars";
+++ }
+++ //ADDED BY GNSS-SDR. START
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ //ADDED BY GNSS-SDR. END
+++ else throw "unsupported 1 arg function >1 scalars";
++ break;
++ case 2:
++ if(inputsc.size() == 0) {
++@@ -437,7 +505,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ } else {
++ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++- } else throw "unsupported 2 arg function >1 scalars";
+++ }
+++ //ADDED BY GNSS-SDR. START
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ //ADDED BY GNSS-SDR. END
+++ else throw "unsupported 2 arg function >1 scalars";
++ break;
++ case 3:
++ if(inputsc.size() == 0) {
++@@ -448,11 +526,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ } else {
++ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++- } else throw "unsupported 3 arg function >1 scalars";
+++ }
+++ //ADDED BY GNSS-SDR. START
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ //ADDED BY GNSS-SDR. END
+++ else throw "unsupported 3 arg function >1 scalars";
++ break;
++ case 4:
++ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ break;
+++ //ADDED BY GNSS-SDR. START
+++ case 8:
+++ if(inputsc.size() == 0) {
+++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else throw "unsupported 8 arg function >1 scalars";
+++ break;
+++ case 12:
+++ if(inputsc.size() == 0) {
+++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else throw "unsupported 12 arg function >1 scalars";
+++ break;
+++ //ADDED BY GNSS-SDR. END
++ default:
++ throw "no function handler for this signature";
++ break;
++@@ -461,13 +589,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ end = clock();
++ double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC;
++ std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl;
++- if(results) {
++- volk_gnsssdr_test_time_t result;
++- result.name = arch_list[i];
++- result.time = arch_time;
++- result.units = "ms";
++- results->back().results[result.name] = result;
++- }
++
++ profile_times.push_back(arch_time);
++ }
++@@ -568,14 +689,13 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++
++ std::cout << "Best aligned arch: " << best_arch_a << std::endl;
++ std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
++- if(results) {
+++ if(best_arch_vector) {
++ if(puppet_master_name == "NULL") {
++- results->back().config_name = name;
++- } else {
++- results->back().config_name = puppet_master_name;
+++ best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
+++ }
+++ else {
+++ best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
++ }
++- results->back().best_arch_a = best_arch_a;
++- results->back().best_arch_u = best_arch_u;
++ }
++
++ return fail_global;
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h
++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-15 01:55:08.000000000 +0200
++@@ -3,10 +3,7 @@
++
++ #include <cstdlib>
++ #include <string>
++-#include <iostream>
++-#include <fstream>
++ #include <vector>
++-#include <map>
++ #include <volk_gnsssdr/volk_gnsssdr.h>
++ #include <volk_gnsssdr/volk_gnsssdr_common.h>
++
++@@ -24,46 +21,10 @@ volk_gnsssdr_type_t volk_gnsssdr_type_fr
++ float uniform(void);
++ void random_floats(float *buf, unsigned n);
++
++-class volk_gnsssdr_test_time_t {
++- public:
++- std::string name;
++- double time;
++- std::string units;
++-};
+++bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string, bool benchmark_mode=false, std::string kernel_regex="");
++
++-class volk_gnsssdr_test_results_t {
++- public:
++- std::string name;
++- std::string config_name;
++- int vlen;
++- int iter;
++- std::map<std::string, volk_gnsssdr_test_time_t> results;
++- std::string best_arch_a;
++- std::string best_arch_u;
++-};
++
++-bool run_volk_gnsssdr_tests(
++- volk_gnsssdr_func_desc_t,
++- void(*)(),
++- std::string,
++- float,
++- lv_32fc_t,
++- int,
++- int,
++- std::vector<volk_gnsssdr_test_results_t> *results = NULL,
++- std::string puppet_master_name = "NULL",
++- bool benchmark_mode=false,
++- std::string kernel_regex=""
++- );
++-
++-
++-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
++- BOOST_AUTO_TEST_CASE(func##_test) { \
++- BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
++- func##_get_func_desc(), (void (*)())func##_manual, \
++- std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
++- 0); \
++- }
+++#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
++ #define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex)
++ #define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex)
++ typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
++@@ -77,4 +38,25 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f
++ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
++ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
++
+++//ADDED BY GNSS-SDR. START
+++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
+++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
+++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
+++
+++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
+++
+++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
+++//ADDED BY GNSS-SDR. END
+++
++ #endif //VOLK_QA_UTILS_H
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_RUNTIME
++-#define INCLUDED_VOLK_RUNTIME
+++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME
+++#define INCLUDED_VOLK_GNSSSDR_RUNTIME
++
++ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
++ #include <volk_gnsssdr/volk_gnsssdr_config_fixed.h>
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
++-#define INCLUDED_VOLK_CONFIG_FIXED_H
+++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
+++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
++
++ #for $i, $arch in enumerate($archs)
++ #define LV_$(arch.name.upper()) $i
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_CPU_H
++-#define INCLUDED_VOLK_CPU_H
+++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H
+++#define INCLUDED_VOLK_GNSSSDR_CPU_H
++
++ #include <volk_gnsssdr/volk_gnsssdr_common.h>
++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_LIBVOLK_MACHINES_H
++-#define INCLUDED_LIBVOLK_MACHINES_H
+++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
++
++ #include <volk_gnsssdr/volk_gnsssdr_common.h>
++ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_TYPEDEFS
++-#define INCLUDED_VOLK_TYPEDEFS
+++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS
+++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS
++
++ #include <inttypes.h>
++ #include <volk_gnsssdr/volk_gnsssdr_complex.h>
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch
+--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 2014-10-17 03:35:38.000000000 +0200
+@@ -0,0 +1,19299 @@
++Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc
++--- /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc 2014-10-17 01:45:18.000000000 +0200
++@@ -37,49 +37,6 @@
++
++ namespace fs = boost::filesystem;
++
++-void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_t> results) {
++- json_file << "{" << std::endl;
++- json_file << " \"volk_gnsssdr_tests\": [" << std::endl;
++- size_t len = results.size();
++- size_t i = 0;
++- BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) {
++- json_file << " {" << std::endl;
++- json_file << " \"name\": \"" << result.name << "\"," << std::endl;
++- json_file << " \"vlen\": " << result.vlen << "," << std::endl;
++- json_file << " \"iter\": " << result.iter << "," << std::endl;
++- json_file << " \"best_arch_a\": \"" << result.best_arch_a
++- << "\"," << std::endl;
++- json_file << " \"best_arch_u\": \"" << result.best_arch_u
++- << "\"," << std::endl;
++- json_file << " \"results\": {" << std::endl;
++- size_t results_len = result.results.size();
++- size_t ri = 0;
++- typedef std::pair<std::string, volk_gnsssdr_test_time_t> tpair;
++- BOOST_FOREACH(tpair pair, result.results) {
++- volk_gnsssdr_test_time_t time = pair.second;
++- json_file << " \"" << time.name << "\": {" << std::endl;
++- json_file << " \"name\": \"" << time.name << "\"," << std::endl;
++- json_file << " \"time\": " << time.time << "," << std::endl;
++- json_file << " \"units\": \"" << time.units << "\"" << std::endl;
++- json_file << " }" ;
++- if(ri+1 != results_len) {
++- json_file << ",";
++- }
++- json_file << std::endl;
++- ri++;
++- }
++- json_file << " }" << std::endl;
++- json_file << " }";
++- if(i+1 != len) {
++- json_file << ",";
++- }
++- json_file << std::endl;
++- i++;
++- }
++- json_file << " ]" << std::endl;
++- json_file << "}" << std::endl;
++-}
++-
++ int main(int argc, char *argv[]) {
++ // Adding program options
++ boost::program_options::options_description desc("Options");
++@@ -92,9 +49,6 @@ int main(int argc, char *argv[]) {
++ ("tests-regex,R",
++ boost::program_options::value<std::string>(),
++ "Run tests matching regular expression.")
++- ("json,j",
++- boost::program_options::value<std::string>(),
++- "JSON output file")
++ ;
++
++ // Handle the options that were given
++@@ -102,8 +56,6 @@ int main(int argc, char *argv[]) {
++ bool benchmark_mode;
++ std::string kernel_regex;
++ bool store_results = true;
++- std::ofstream json_file;
++-
++ try {
++ boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
++ boost::program_options::notify(vm);
++@@ -131,14 +83,9 @@ int main(int argc, char *argv[]) {
++ return 0;
++ }
++
++- if ( vm.count("json") )
++- {
++- json_file.open( vm["json"].as<std::string>().c_str() );
++- }
++-
++
++ // Run tests
++- std::vector<volk_gnsssdr_test_results_t> results;
+++ std::vector<std::string> results;
++
++ //VOLK_PROFILE(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
++ //VOLK_PROFILE(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
++@@ -155,6 +102,55 @@ int main(int argc, char *argv[]) {
++
++ // Until we can update the config on a kernel by kernel basis
++ // do not overwrite volk_gnsssdr_config when using a regex.
+++
+++ //GNSS-SDR PROTO-KERNELS
+++ //lv_32fc_t sfv = lv_cmake((float)1, (float)2);
+++ //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++
+++ //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT:
+++ //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex);
+++ //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex);
+++
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++
+++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
+++
+++ /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
+++ VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/
+++
++ if(store_results) {
++ char path[1024];
++ volk_gnsssdr_get_config_path(path);
++@@ -178,10 +174,8 @@ int main(int argc, char *argv[]) {
++ #the function name is followed by the preferred architecture.\n\
++ ";
++
++- BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) {
++- config << result.config_name << " "
++- << result.best_arch_a << " "
++- << result.best_arch_u << std::endl;
+++ BOOST_FOREACH(std::string result, results) {
+++ config << result << std::endl;
++ }
++ config.close();
++ }
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,174 @@
+++/*!
+++ * \file CommonMacros.h
+++ * \brief Common macros used inside the volk protokernels.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++#ifndef INCLUDED_gnsssdr_CommonMacros_u_H
+++#define INCLUDED_gnsssdr_CommonMacros_u_H
+++
+++ #ifdef LV_HAVE_SSE4_1
+++ /*!
+++ \brief Macros for U_SSE4_1
+++ */
+++
+++ #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1
+++ #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\
+++ imag = _mm_srli_si128 (input1, 2);\
+++ imag = _mm_blend_epi16 (input2, imag, 85);\
+++ real = _mm_slli_si128 (input2, 2);\
+++ real = _mm_blend_epi16 (real, input1, 85);
+++ #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */
+++
+++ #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1
+++ #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\
+++ input_i_1 = _mm_cvtepi16_epi32(input);\
+++ input = _mm_srli_si128 (input, 8);\
+++ input_i_2 = _mm_cvtepi16_epi32(input);\
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\
+++ output_ps = _mm_cvtepi32_ps(output_i32);
+++ #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
+++
+++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1
+++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
+++ input_i_1 = _mm_cvtepi8_epi32(input);\
+++ input = _mm_srli_si128 (input, 4);\
+++ input_i_2 = _mm_cvtepi8_epi32(input);\
+++ input = _mm_srli_si128 (input, 4);\
+++ output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\
+++ input_i_1 = _mm_cvtepi8_epi32(input);\
+++ input = _mm_srli_si128 (input, 4);\
+++ input_i_2 = _mm_cvtepi8_epi32(input);\
+++ input = _mm_srli_si128 (input, 4);\
+++ output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\
+++ output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\
+++ output_ps = _mm_cvtepi32_ps(output_i32);
+++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
+++
+++ #endif /* LV_HAVE_SSE4_1 */
+++
+++ #ifdef LV_HAVE_SSE2
+++ /*!
+++ \brief Macros for U_SSE2
+++ */
+++
+++ #ifdef LV_HAVE_SSSE3
+++ /*!
+++ \brief Macros for U_SSSE3
+++ */
+++
+++ #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3
+++ #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\
+++ y_aux = _mm_sign_epi8 (y, x);\
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\
+++ real_output = _mm_maddubs_epi16 (x_abs, y_aux);\
+++ \
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\
+++ y_aux = _mm_sign_epi8 (y_aux, x);\
+++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux);
+++ #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */
+++
+++ #endif /* LV_HAVE_SSSE3 */
+++
+++ #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2
+++ #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);\
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++ #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */
+++
+++ #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2
+++ #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\
+++ imag = _mm_srli_si128 (input, 1);\
+++ imag = _mm_and_si128 (imag, mult1);\
+++ real = _mm_and_si128 (input, mult1);
+++ #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */
+++
+++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2
+++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
+++ input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\
+++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
+++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
+++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\
+++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\
+++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);\
+++ \
+++ input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\
+++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
+++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
+++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\
+++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\
+++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */
+++
+++ #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2
+++ #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
+++ minus128control = _mm_cmpeq_epi8 (y, minus128);\
+++ y = _mm_sub_epi8 (y, minus128control);
+++ #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */
+++
+++ #endif /* LV_HAVE_SSE2 */
+++
+++ #ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Macros for U_GENERIC
+++ */
+++
+++ #endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_CommonMacros_a_H
+++#define INCLUDED_gnsssdr_CommonMacros_a_H
+++
+++ #ifdef LV_HAVE_SSE4_1
+++ /*!
+++ \brief Macros for A_SSE4_1
+++ */
+++
+++ #endif /* LV_HAVE_SSE4_1 */
+++
+++ #ifdef LV_HAVE_SSE2
+++ /*!
+++ \brief Macros for A_SSE2
+++ */
+++
+++ #endif /* LV_HAVE_SSE2 */
+++
+++ #ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Macros for A_GENERIC
+++ */
+++
+++ #endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,76 @@
+++/*!
+++ * \file CommonMacros_16ic_cw_epl_corr_32fc.h
+++ * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
+++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
+++#include "CommonMacros/CommonMacros.h"
+++
+++ #ifdef LV_HAVE_SSE4_1
+++ /*!
+++ \brief Macros for U_SSE4_1
+++ */
+++
+++ #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1
+++ #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
+++
+++ #endif /* LV_HAVE_SSE4_1 */
+++
+++ #ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Macros for U_GENERIC
+++ */
+++
+++ #endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
+++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
+++
+++ #ifdef LV_HAVE_SSE4_1
+++ /*!
+++ \brief Macros for A_SSE4_1
+++ */
+++
+++ #endif /* LV_HAVE_SSE4_1 */
+++
+++ #ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Macros for A_GENERIC
+++ */
+++
+++ #endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,114 @@
+++/*!
+++ * \file CommonMacros_8ic_cw_epl_corr_32fc.h
+++ * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
+++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
+++#include "CommonMacros/CommonMacros.h"
+++
+++ #ifdef LV_HAVE_SSE4_1
+++ /*!
+++ \brief Macros for U_SSE4_1. CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 rearranges y into realy/imagy, runs the scalar-product helper against the base-band sample, shifts imag_output left by one byte, blends it with real_output under the mult1 mask into `output`, and hands `output` to the convert-and-accumulate helper.
+++ \note `output` is assigned by the expansion but is NOT a macro parameter: a variable with exactly that name must already be declared where the macro is used. */
+++
+++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1
+++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+++ \
+++ imag_output = _mm_slli_si128 (imag_output, 1);\
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);\
+++ \
+++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
+++
+++ #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 /* "Safe" correlation step: invokes CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control) first, then the SSSE3 scalar product and the convert-and-accumulate helper for real_output and imag_output. Presumably the extra step handles the -128 input value before the product -- TODO confirm against CommonMacros.h */
+++ #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
+++ CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
+++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+++ #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */
+++
+++ #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 /* "Unsafe" correlation step: SSSE3 scalar product followed by the convert-and-accumulate helper for real_output and imag_output; it does not invoke CM_8IC_CONTROLMINUS128_8IC_U_SSE2 on y beforehand */
+++ #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
+++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+++ #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */
+++
+++ #endif /* LV_HAVE_SSE4_1 */
+++
+++ #ifdef LV_HAVE_SSE2
+++ /*!
+++ \brief Macros for U_SSE2. CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 rearranges y into realy/imagy, runs the scalar-product helper against the base-band sample, masks both results with mult1, shifts imag_output left by one byte, ORs the two into `output`, and hands `output` to the SSE2 convert-and-accumulate helper.
+++ \note `output` is assigned by the expansion but is NOT a macro parameter: a variable with exactly that name must already be declared where the macro is used. */
+++
+++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2
+++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
+++ \
+++ real_output = _mm_and_si128 (real_output, mult1);\
+++ imag_output = _mm_and_si128 (imag_output, mult1);\
+++ imag_output = _mm_slli_si128 (imag_output, 1);\
+++ output = _mm_or_si128 (real_output, imag_output);\
+++ \
+++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */
+++
+++ #endif /* LV_HAVE_SSE2 */
+++
+++ #ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Macros for U_GENERIC
+++ */
+++
+++ #endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
+++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
+++
+++ #ifdef LV_HAVE_SSE4_1
+++ /*!
+++ \brief Macros for A_SSE4_1
+++ */
+++
+++ #endif /* LV_HAVE_SSE4_1 */
+++
+++ #ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Macros for A_GENERIC
+++ */
+++
+++ #endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,34 @@
+++####################################################################
+++Common Macros inside volk_gnsssdr module
+++####################################################################
+++
+++First of all, sorry that you need to read this: macros are evil, they cannot be debugged, you do not know where the errors come from, and their syntax is annoying... BUT this is the only way I found to share one piece of code between various proto-kernels without performance penalties.
+++Inline functions have been tested: they introduce a really small time penalty, but it becomes huge in long loops over thousands of samples.
+++
+++####################################################################
+++Syntax
+++####################################################################
+++
+++In order to make the code easier to understand, I created the macros with a specific syntax.
+++
+++1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example:
+++
+++example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+++
+++First of all, you find the characters “CM”, which stand for CommonMacros. After that come the type and the number of inputs: “_16IC_X4” (16-bit complex integers, four inputs). The syntax for the type is the same as the one used with volk protokernels; refer to the GNU Radio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run it (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output).
+++The variables that the macro needs are passed explicitly when calling it in order to avoid problems that would only show up after compilation: if you want to use a macro, you need to declare all the variables it needs beforehand, or the code will not compile.
+++
+++2) Inside all the other headers, CommonMacros_XXXXXX.h, you will find macros for a specific group of proto-kernels. The syntax is the same as in CommonMacros.h.
+++
+++####################################################################
+++Workflow
+++####################################################################
+++
+++In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints.
+++When it works, I place the code inside a macro and I test it again.
+++
+++####################################################################
+++Why macros
+++####################################################################
+++1) They are the only way I could find for sharing code between proto-kernels without performance penalty.
+++2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks the results of all the SSE proto-kernel implementations against the results of the generic implementation, so if your macro is not working you will notice it after profiling it.
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,241 @@
+++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
+++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include <smmintrin.h>
+++
+++ /*!
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+++ \param outputVector The floating point output data buffer
+++ \param inputVector The 16 bit input data buffer
+++ \param scalar The value each converted data point is divided by
+++ \param num_points The number of data values to be converted
+++ \note Neither buffer needs to be aligned: only unaligned load/store intrinsics are used. The vector loop multiplies by 1.0/scalar while the scalar tail divides by scalar.
+++ */
+++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int eighthPoints = num_points / 8; // the vector loop converts 8 values per iteration
+++
+++ float* outputVectorPtr = outputVector;
+++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal computed once so the loop can multiply instead of divide
+++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through inputPtr
+++ __m128i inputVal;
+++ __m128i inputVal2;
+++ __m128 ret;
+++
+++ for(;number < eighthPoints; number++){
+++
+++ // Load the 8 values (unaligned load)
+++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+++
+++ // Shift the input data to the right by 64 bits ( 8 bytes ) so the upper 4 values land in the lower half
+++ inputVal2 = _mm_srli_si128(inputVal, 8);
+++
+++ // Convert the lower 4 values of each register into 32 bit words
+++ inputVal = _mm_cvtepi16_epi32(inputVal);
+++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+++
+++ ret = _mm_cvtepi32_ps(inputVal); // values 0..3 as floats
+++ ret = _mm_mul_ps(ret, invScalar);
+++ _mm_storeu_ps(outputVectorPtr, ret);
+++ outputVectorPtr += 4;
+++
+++ ret = _mm_cvtepi32_ps(inputVal2); // values 4..7 as floats
+++ ret = _mm_mul_ps(ret, invScalar);
+++ _mm_storeu_ps(outputVectorPtr, ret);
+++
+++ outputVectorPtr += 4;
+++
+++ inputPtr += 8;
+++ }
+++
+++ number = eighthPoints * 8; // scalar tail: the remaining num_points % 8 values
+++ for(; number < num_points; number++){
+++ outputVector[number] =((float)(inputVector[number])) / scalar;
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++
+++ /*!
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+++ \param outputVector The floating point output data buffer
+++ \param inputVector The 16 bit input data buffer
+++ \param scalar The value each converted data point is divided by
+++ \param num_points The number of data values to be converted
+++ \note Output buffer does NOT need to be properly aligned (unaligned stores are used). The vector loop multiplies by 1.0/scalar while the scalar tail divides by scalar.
+++ */
+++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // the vector loop converts 4 values per iteration
+++
+++ float* outputVectorPtr = outputVector;
+++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal computed once so the loop can multiply instead of divide
+++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through inputPtr
+++ __m128 ret;
+++
+++ for(;number < quarterPoints; number++){
+++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); // int16 -> float done per element; arguments are listed highest lane first
+++
+++ ret = _mm_mul_ps(ret, invScalar);
+++ _mm_storeu_ps(outputVectorPtr, ret);
+++
+++ inputPtr += 4;
+++ outputVectorPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
+++ for(; number < num_points; number++){
+++ outputVector[number] = (float)(inputVector[number]) / scalar;
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+++ \param outputVector The floating point output data buffer
+++ \param inputVector The 16 bit input data buffer
+++ \param scalar The value each converted data point is divided by
+++ \param num_points The number of data values to be converted
+++ \note Output buffer does NOT need to be properly aligned (plain element-wise C loop, no SIMD intrinsics)
+++ */
+++static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+++ float* outputVectorPtr = outputVector;
+++ const int16_t* inputVectorPtr = inputVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; // convert first, then divide in floating point
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
+++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include <smmintrin.h>
+++
+++ /*!
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+++ \param outputVector The floating point output data buffer
+++ \param inputVector The 16 bit input data buffer
+++ \param scalar The value each converted data point is divided by
+++ \param num_points The number of data values to be converted
+++ \note Although this is the "_a_" variant, the body uses the unaligned intrinsics (_mm_loadu_si128 / _mm_storeu_ps). The vector loop multiplies by 1.0/scalar while the scalar tail divides by scalar. */
+++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int eighthPoints = num_points / 8; // the vector loop converts 8 values per iteration
+++
+++ float* outputVectorPtr = outputVector;
+++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal computed once so the loop can multiply instead of divide
+++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through inputPtr
+++ __m128i inputVal;
+++ __m128i inputVal2;
+++ __m128 ret;
+++
+++ for(;number < eighthPoints; number++){
+++
+++ // Load the 8 values (unaligned load)
+++ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+++
+++ // Shift the input data to the right by 64 bits ( 8 bytes ) so the upper 4 values land in the lower half
+++ inputVal2 = _mm_srli_si128(inputVal, 8);
+++
+++ // Convert the lower 4 values of each register into 32 bit words
+++ inputVal = _mm_cvtepi16_epi32(inputVal);
+++ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+++
+++ ret = _mm_cvtepi32_ps(inputVal); // values 0..3 as floats
+++ ret = _mm_mul_ps(ret, invScalar);
+++ _mm_storeu_ps(outputVectorPtr, ret);
+++ outputVectorPtr += 4;
+++
+++ ret = _mm_cvtepi32_ps(inputVal2); // values 4..7 as floats
+++ ret = _mm_mul_ps(ret, invScalar);
+++ _mm_storeu_ps(outputVectorPtr, ret);
+++
+++ outputVectorPtr += 4;
+++
+++ inputPtr += 8;
+++ }
+++
+++ number = eighthPoints * 8; // scalar tail: the remaining num_points % 8 values
+++ for(; number < num_points; number++){
+++ outputVector[number] =((float)(inputVector[number])) / scalar;
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++
+++ /*!
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+++ \param outputVector The floating point output data buffer
+++ \param inputVector The 16 bit input data buffer
+++ \param scalar The value each converted data point is divided by
+++ \param num_points The number of data values to be converted
+++ \note Although this is the "_a_" variant, the body stores with the unaligned intrinsic _mm_storeu_ps. The vector loop multiplies by 1.0/scalar while the scalar tail divides by scalar. */
+++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // the vector loop converts 4 values per iteration
+++
+++ float* outputVectorPtr = outputVector;
+++ __m128 invScalar = _mm_set_ps1(1.0/scalar); // reciprocal computed once so the loop can multiply instead of divide
+++ int16_t* inputPtr = (int16_t*)inputVector; // cast drops const; the data is only read through inputPtr
+++ __m128 ret;
+++
+++ for(;number < quarterPoints; number++){
+++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); // int16 -> float done per element; arguments are listed highest lane first
+++
+++ ret = _mm_mul_ps(ret, invScalar);
+++ _mm_storeu_ps(outputVectorPtr, ret);
+++
+++ inputPtr += 4;
+++ outputVectorPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
+++ for(; number < num_points; number++){
+++ outputVector[number] = (float)(inputVector[number]) / scalar;
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+++ \param outputVector The floating point output data buffer
+++ \param inputVector The 16 bit input data buffer
+++ \param scalar The value each converted data point is divided by
+++ \param num_points The number of data values to be converted
+++ \note Plain element-wise C loop; the body is the same as the non-"_a_" generic version. */
+++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+++ float* outputVectorPtr = outputVector;
+++ const int16_t* inputVectorPtr = inputVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; // convert first, then divide in floating point
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,461 @@
+++/*!
+++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
+++ * real part and 16 bits the imaginary part):
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 32 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 32 bits vectors), accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 32 bits vectors), accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 32 bits vectors), accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++ /*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE4.1 version; a scalar loop handles the last num_points % 8 samples)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // each vector iteration consumes 8 complex samples (every pointer advances by 4 twice)
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0; // outputs start at zero so the scalar tail below can accumulate even when sse_iters == 0
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // NOTE(review): mult1 is not passed to any macro invoked in this function; possibly unused -- confirm against the macro definitions
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++ input_ptr += 4; // move to the second group of 4 samples of this iteration
+++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ //Adds the float 32 results
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ input_ptr += 4; // step past the second group loaded in this iteration (same for the pointers below)
+++ carrier_ptr += 4;
+++ E_code_ptr += 4;
+++ P_code_ptr += 4;
+++ L_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i) // sum the four lanes of every accumulator
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: samples left over after the vector loop; the pointers already point at them
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); // product is formed in lv_16sc_t, then converted to lv_32fc_t
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic, element-by-element version)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ lv_16sc_t bb_signal_sample;
+++ lv_16sc_t tmp1;
+++ lv_16sc_t tmp2;
+++ lv_16sc_t tmp3;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0; // the three outputs are accumulators and start at zero
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation
+++
+++ for(unsigned int i=0; i < num_points; ++i) // index is unsigned to match num_points (no signed/unsigned comparison)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++
+++ tmp1 = bb_signal_sample * E_code[i]; // products are formed in lv_16sc_t before being converted below
+++ tmp2 = bb_signal_sample * P_code[i];
+++ tmp3 = bb_signal_sample * L_code[i];
+++
+++ // Now get early, late, and prompt values for each
+++ *E_out += (lv_32fc_t)tmp1;
+++ *P_out += (lv_32fc_t)tmp2;
+++ *L_out += (lv_32fc_t)tmp3;
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_load_si128((__m128i*)input_ptr);
+++ input_ptr += 4;
+++ x2 = _mm_load_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_load_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y1 = _mm_load_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ //Adds the float 32 results
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_load_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_load_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ input_ptr += 4;
+++ carrier_ptr += 4;
+++ E_code_ptr += 4;
+++ P_code_ptr += 4;
+++ L_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic version: plain loop over all num_points samples, no SIMD)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++    lv_16sc_t bb_signal_sample;
+++    lv_16sc_t tmp1; // the three products are held in lv_16sc_t before being converted to lv_32fc_t
+++    lv_16sc_t tmp2;
+++    lv_16sc_t tmp3;
+++
+++    bb_signal_sample = lv_cmake(0, 0);
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    // perform Early, Prompt and Late correlation
+++
+++    for(unsigned int i = 0; i < num_points; ++i) // index has the same unsigned type as num_points: no signed/unsigned comparison, no signed overflow for large counts
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = input[i] * carrier[i];
+++
+++        tmp1 = bb_signal_sample * E_code[i];
+++        tmp2 = bb_signal_sample * P_code[i];
+++        tmp3 = bb_signal_sample * L_code[i];
+++
+++        // Now get early, late, and prompt values for each
+++        *E_out += (lv_32fc_t)tmp1;
+++        *P_out += (lv_32fc_t)tmp2;
+++        *L_out += (lv_32fc_t)tmp3;
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,1568 @@
+++/*!
+++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
+++ * real part and 16 bits the imaginary part):
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 32 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 32 bits vectors), accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 32 bits vectors), accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 32 bits vectors), accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++ /*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (first test method: SSE4.1 with unaligned loads, 4 complex samples per iteration kept in interleaved real/imaginary order)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 4; // number of full 4-sample SIMD iterations
+++
+++    __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled;
+++
+++    __m128 z_ps_1, z_ps_2, z_E, z_P, z_L;
+++    __m128i z_i_1, z_i_2;
+++
+++    lv_32fc_t dotProduct_E = 0; // zero-initialised: the SSE branch below does not run when num_points < 4, and the tail loop accumulates with +=
+++    lv_32fc_t dotProduct_P = 0;
+++    lv_32fc_t dotProduct_L = 0;
+++
+++    z_E = _mm_setzero_ps();
+++    z_P = _mm_setzero_ps();
+++    z_L = _mm_setzero_ps();
+++
+++    const lv_16sc_t* _input = input;
+++    const lv_16sc_t* _carrier = carrier;
+++    const lv_16sc_t* _E_code = E_code;
+++    const lv_16sc_t* _P_code = P_code;
+++    const lv_16sc_t* _L_code = L_code;
+++
+++    if (sse_iters>0)
+++    {
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            //Perform the carrier wipe-off
+++            x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi (4 complex samples per register)
+++            y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++            // Load yl with cr,cr,dr,dr
+++            // Load yh with ci,ci,di,di
+++            yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); // real parts to the low half, imaginary parts to the high half
+++            yl = _mm_unpacklo_epi16(yaux, yaux);
+++            yh = _mm_unpackhi_epi16(yaux, yaux);
+++
+++            tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++            x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br
+++
+++            tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++            tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); // negate the ai*ci terms (even lanes) so the add below yields the real part
+++            bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++            bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+++
+++            // correlation E,P,L (3x vector scalar product)
+++            // Early
+++            y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++            yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+++            yl = _mm_unpacklo_epi16(yaux, yaux);
+++            yh = _mm_unpackhi_epi16(yaux, yaux);
+++
+++            tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++            tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++            tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
+++            z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++            z_i_1 = _mm_cvtepi16_epi32(z); // low 4 lanes (2 complex values) widened to 32 bits
+++            z_ps_1 = _mm_cvtepi32_ps(z_i_1);
+++            z = _mm_srli_si128 (z, 8); // bring the high 4 lanes down
+++            z_i_2 = _mm_cvtepi16_epi32(z);
+++            z_ps_2 = _mm_cvtepi32_ps(z_i_2);
+++
+++            z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together
+++            z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together
+++
+++            // Prompt
+++            y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++            yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+++            yl = _mm_unpacklo_epi16(yaux, yaux);
+++            yh = _mm_unpackhi_epi16(yaux, yaux);
+++
+++            tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++            tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++            tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
+++            z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++            z_i_1 = _mm_cvtepi16_epi32(z);
+++            z_ps_1 = _mm_cvtepi32_ps(z_i_1);
+++            z = _mm_srli_si128 (z, 8);
+++            z_i_2 = _mm_cvtepi16_epi32(z);
+++            z_ps_2 = _mm_cvtepi32_ps(z_i_2);
+++
+++            z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together
+++            z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together
+++
+++            // Late
+++            y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++            yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
+++            yl = _mm_unpacklo_epi16(yaux, yaux);
+++            yh = _mm_unpackhi_epi16(yaux, yaux);
+++
+++            tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++            tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++            tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1));
+++            z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++            z_i_1 = _mm_cvtepi16_epi32(z);
+++            z_ps_1 = _mm_cvtepi32_ps(z_i_1);
+++            z = _mm_srli_si128 (z, 8);
+++            z_i_2 = _mm_cvtepi16_epi32(z);
+++            z_ps_2 = _mm_cvtepi32_ps(z_i_2);
+++
+++            z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together
+++            z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together
+++
+++            _input += 4;
+++            _carrier += 4;
+++            _E_code += 4;
+++            _L_code += 4;
+++            _P_code += 4;
+++        }
+++
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+++
+++        _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+++
+++        dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); // each accumulator holds two complex partial sums
+++        dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+++        dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+++    }
+++
+++    for(unsigned int i = 0; i < num_points % 4; ++i) // scalar tail: samples not covered by the SIMD loop
+++    {
+++        dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier));
+++        dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier));
+++        dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++)); // input and carrier advance only here, after all three products have used them
+++    }
+++
+++    *E_out = dotProduct_E;
+++    *P_out = dotProduct_P;
+++    *L_out = dotProduct_L;
+++
+++
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (second test method: SSE4.1 with unaligned loads, real and imaginary parts split into separate registers, 8 complex samples per iteration)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // number of full 8-sample SIMD iterations
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+++ __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0; // outputs start at zero; they are overwritten after the SIMD loop and then extended by the scalar tail
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // NOTE(review): mult1 is assigned here but never read in this function
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 128-bit load of the first 4 samples of this iteration
+++ input_ptr += 4; // step over the 4 samples just loaded
+++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); // next 4 samples
+++
+++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x1, 2); // shift x1 down by one 16-bit lane so its imaginary parts sit on the even lanes
+++ imagx = _mm_blend_epi16 (x2, imagx, 85); // even lanes from the shifted x1, odd lanes (already imaginary) from x2
+++ realx = _mm_slli_si128 (x2, 2); // shift x2 up by one lane so its real parts sit on the odd lanes
+++ realx = _mm_blend_epi16 (realx, x1, 85); // even lanes (real) from x1, odd lanes from the shifted x2; same sample order as imagx
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // _mm_mullo_epi16 keeps the low 16 bits of each product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // Re = xr*yr - xi*yi
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // Im = xr*yi + xi*yr
+++
+++ //Get early values
+++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); // sign-extend the low 4 16-bit lanes to 32 bits
+++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++ real_output = _mm_srli_si128 (real_output, 8); // bring the high 4 lanes down
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); // accumulate in float, both halves into the same 4 lanes
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
+++
+++ //Get prompt values
+++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
+++
+++ //Get late values
+++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
+++
+++ input_ptr += 4; // second half of the 8-sample advance (the first half happened between the two loads)
+++ carrier_ptr += 4;
+++ E_code_ptr += 4;
+++ L_code_ptr += 4;
+++ P_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i) // horizontal sum of the 4 lanes of every accumulator
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); // replaces the zero written above with the SIMD partial result
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not covered by the SIMD loop
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++ unsigned int index = 0;
+++ unsigned int indexPlus4 = 0;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(index = 0;index < 8*sse_iters; index+=8){
+++ indexPlus4 = index + 4;
+++ //Perform the carrier wipe-off
+++ x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]);
+++ x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]);
+++
+++ y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]);
+++ y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]);
+++
+++ imagx = _mm_srli_si128 (x1, 2);
+++ imagx = _mm_blend_epi16 (x2, imagx, 85);
+++ realx = _mm_slli_si128 (x2, 2);
+++ realx = _mm_blend_epi16 (realx, x1, 85);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ //Get early values
+++ y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]);
+++ y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]);
+++ y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]);
+++ y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(; index < num_points; index++)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input_ptr[index] * carrier_ptr[index];
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]);
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]);
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]);
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++ input_ptr += 4;
+++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x1, 2);
+++ imagx = _mm_blend_epi16 (x2, imagx, 85);
+++ realx = _mm_slli_si128 (x2, 2);
+++ realx = _mm_blend_epi16 (realx, x1, 85);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ //Get early values
+++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y1, 2);
+++ imagy = _mm_blend_epi16 (y2, imagy, 85);
+++ realy = _mm_slli_si128 (y2, 2);
+++ realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ input_ptr += 4;
+++ carrier_ptr += 4;
+++ E_code_ptr += 4;
+++ L_code_ptr += 4;
+++ P_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy;
+++ __m128i input_i_1, input_i_2, output_i32;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i realx, imagx, realy, imagy, real_output, imag_output;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++ input_ptr += 4;
+++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+++
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+++
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
+++
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)
+++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ input_ptr += 4;
+++ carrier_ptr += 4;
+++ E_code_ptr += 4;
+++ L_code_ptr += 4;
+++ P_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy;
+++ __m128i input_i_1, input_i_2, output_i32;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i realx, imagx, realy, imagy, real_output, imag_output;
+++
+++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++ input_ptr += 4;
+++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ input_ptr += 4;
+++ carrier_ptr += 4;
+++ E_code_ptr += 4;
+++ L_code_ptr += 4;
+++ P_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ }
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ lv_16sc_t bb_signal_sample;
+++ lv_16sc_t tmp1;
+++ lv_16sc_t tmp2;
+++ lv_16sc_t tmp3;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation
+++
+++ for(int i=0; i < num_points; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++
+++ tmp1 = bb_signal_sample * E_code[i];
+++ tmp2 = bb_signal_sample * P_code[i];
+++ tmp3 = bb_signal_sample * L_code[i];
+++
+++ // Now get early, late, and prompt values for each
+++ *E_out += (lv_32fc_t)tmp1;
+++ *P_out += (lv_32fc_t)tmp2;
+++ *L_out += (lv_32fc_t)tmp3;
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++//
+++//#ifdef LV_HAVE_SSE4_1
+++//#include "smmintrin.h"
+++///*!
+++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++// \param input The input signal input
+++// \param carrier The carrier signal input
+++// \param E_code Early PRN code replica input
+++// \param P_code Prompt PRN code replica input
+++// \param L_code Late PRN code replica input
+++// \param E_out Early correlation output
+++// \param P_out Prompt correlation output
+++// \param L_out Late correlation output
+++// \param num_points The number of complex values in vectors
+++// */
+++//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++//{
+++// const unsigned int sse_iters = num_points / 8;
+++//
+++// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++//
+++// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+++// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
+++//
+++// float E_out_real = 0;
+++// float E_out_imag = 0;
+++// float P_out_real = 0;
+++// float P_out_imag = 0;
+++// float L_out_real = 0;
+++// float L_out_imag = 0;
+++//
+++// const lv_16sc_t* input_ptr = input;
+++// const lv_16sc_t* carrier_ptr = carrier;
+++//
+++// const lv_16sc_t* E_code_ptr = E_code;
+++// lv_32fc_t* E_out_ptr = E_out;
+++// const lv_16sc_t* L_code_ptr = L_code;
+++// lv_32fc_t* L_out_ptr = L_out;
+++// const lv_16sc_t* P_code_ptr = P_code;
+++// lv_32fc_t* P_out_ptr = P_out;
+++//
+++// *E_out_ptr = 0;
+++// *P_out_ptr = 0;
+++// *L_out_ptr = 0;
+++//
+++// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++//
+++// real_E_code_acc = _mm_setzero_ps();
+++// imag_E_code_acc = _mm_setzero_ps();
+++// real_P_code_acc = _mm_setzero_ps();
+++// imag_P_code_acc = _mm_setzero_ps();
+++// real_L_code_acc = _mm_setzero_ps();
+++// imag_L_code_acc = _mm_setzero_ps();
+++//
+++// if (sse_iters>0)
+++// {
+++// for(int number = 0;number < sse_iters; number++){
+++//
+++// //Perform the carrier wipe-off
+++// x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++// input_ptr += 4;
+++// x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++//
+++// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++// carrier_ptr += 4;
+++// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++//
+++// imagx = _mm_srli_si128 (x1, 2);
+++// imagx = _mm_blend_epi16 (x2, imagx, 85);
+++// realx = _mm_slli_si128 (x2, 2);
+++// realx = _mm_blend_epi16 (realx, x1, 85);
+++//
+++// imagy = _mm_srli_si128 (y1, 2);
+++// imagy = _mm_blend_epi16 (y2, imagy, 85);
+++// realy = _mm_slli_si128 (y2, 2);
+++// realy = _mm_blend_epi16 (realy, y1, 85);
+++//
+++// realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++//
+++// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++//
+++// //Get early values
+++// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++// E_code_ptr += 4;
+++// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++//
+++// imagy = _mm_srli_si128 (y1, 2);
+++// imagy = _mm_blend_epi16 (y2, imagy, 85);
+++// realy = _mm_slli_si128 (y2, 2);
+++// realy = _mm_blend_epi16 (realy, y1, 85);
+++//
+++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++//
+++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++//
+++// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++// real_output = _mm_srli_si128 (real_output, 8);
+++// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++//
+++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++// imag_output = _mm_srli_si128 (imag_output, 8);
+++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++//
+++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1);
+++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
+++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
+++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
+++//
+++// //Get prompt values
+++// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++// P_code_ptr += 4;
+++// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++//
+++// imagy = _mm_srli_si128 (y1, 2);
+++// imagy = _mm_blend_epi16 (y2, imagy, 85);
+++// realy = _mm_slli_si128 (y2, 2);
+++// realy = _mm_blend_epi16 (realy, y1, 85);
+++//
+++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++//
+++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++//
+++// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++// real_output = _mm_srli_si128 (real_output, 8);
+++// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++//
+++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++// imag_output = _mm_srli_si128 (imag_output, 8);
+++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++//
+++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
+++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
+++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
+++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
+++//
+++// //Get late values
+++// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++// L_code_ptr += 4;
+++// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++//
+++// imagy = _mm_srli_si128 (y1, 2);
+++// imagy = _mm_blend_epi16 (y2, imagy, 85);
+++// realy = _mm_slli_si128 (y2, 2);
+++// realy = _mm_blend_epi16 (realy, y1, 85);
+++//
+++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++//
+++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++//
+++// real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++// real_output = _mm_srli_si128 (real_output, 8);
+++// real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++//
+++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++// imag_output = _mm_srli_si128 (imag_output, 8);
+++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++//
+++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
+++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
+++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
+++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
+++//
+++// input_ptr += 4;
+++// carrier_ptr += 4;
+++// E_code_ptr += 4;
+++// L_code_ptr += 4;
+++// P_code_ptr += 4;
+++// }
+++//
+++// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++//
+++// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++//
+++// for (int i = 0; i<4; ++i)
+++// {
+++// E_out_real += real_E_dotProductVector[i];
+++// E_out_imag += imag_E_dotProductVector[i];
+++// P_out_real += real_P_dotProductVector[i];
+++// P_out_imag += imag_P_dotProductVector[i];
+++// L_out_real += real_L_dotProductVector[i];
+++// L_out_imag += imag_L_dotProductVector[i];
+++// }
+++// *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++// *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++// *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++// }
+++//
+++// lv_16sc_t bb_signal_sample;
+++// for(int i=0; i < num_points%8; ++i)
+++// {
+++// //Perform the carrier wipe-off
+++// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++// // Now get early, late, and prompt values for each
+++// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++// }
+++//}
+++//#endif /* LV_HAVE_SSE4_1 */
+++//
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Generic implementation (aligned-named variant): performs the carrier wipe-off mixing and accumulates the Early, Prompt, and Late correlations
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (a single accumulated value is written through this pointer)
+++ \param P_out Prompt correlation output (a single accumulated value is written through this pointer)
+++ \param L_out Late correlation output (a single accumulated value is written through this pointer)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++ lv_16sc_t bb_signal_sample;
+++ lv_16sc_t tmp1;
+++ lv_16sc_t tmp2;
+++ lv_16sc_t tmp3;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // The outputs start at zero; the loop below adds one Early, Prompt and Late product per sample
+++
+++ for(int i=0; i < num_points; ++i)
+++ {
+++ // Carrier wipe-off: multiply the input sample by the carrier sample
+++ bb_signal_sample = input[i] * carrier[i];
+++ // The code products are held in lv_16sc_t temporaries before the conversion to lv_32fc_t below
+++ tmp1 = bb_signal_sample * E_code[i];
+++ tmp2 = bb_signal_sample * P_code[i];
+++ tmp3 = bb_signal_sample * L_code[i];
+++
+++ // Convert each product to lv_32fc_t and add it to the Early, Prompt and Late outputs
+++ *E_out += (lv_32fc_t)tmp1;
+++ *P_out += (lv_32fc_t)tmp2;
+++ *L_out += (lv_32fc_t)tmp3;
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,595 @@
+++/*!
+++ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the
+++ * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them:
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 32 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - Very Early values are calculated by multiplying the input signal in BB by the
+++ * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+++ * - Very Late values are calculated by multiplying the input signal in BB by the
+++ * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++ /*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 variant using unaligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float VE_out_real = 0;
+++ float VE_out_imag = 0;
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++ float VL_out_real = 0;
+++ float VL_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_16sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ *VE_out_ptr = 0;
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ *VL_out_ptr = 0;
+++
+++ real_VE_code_acc = _mm_setzero_ps();
+++ imag_VE_code_acc = _mm_setzero_ps();
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++ real_VL_code_acc = _mm_setzero_ps();
+++ imag_VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ // Carrier wipe-off: two 128-bit unaligned loads each from input and carrier, then the CM_* macros (defined in the CommonMacros headers) are applied
+++ x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++ input_ptr += 4;
+++ x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ // Get very early values: load the VE code and add real_output_ps / imag_output_ps to the VE accumulators
+++ y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++ VE_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++ // Get early values (same sequence with the E code and the E accumulators)
+++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ // Get prompt values (same sequence with the P code and the P accumulators)
+++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ // Get late values (same sequence with the L code and the L accumulators)
+++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ // Get very late values (same sequence with the VL code and the VL accumulators)
+++ y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++ VL_code_ptr += 4;
+++ y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++ input_ptr += 4;
+++ carrier_ptr += 4;
+++ VE_code_ptr += 4;
+++ E_code_ptr += 4;
+++ P_code_ptr += 4;
+++ L_code_ptr += 4;
+++ VL_code_ptr += 4;
+++ }
+++ // Reduction: each __m128 accumulator is stored to a 4-float array and its four entries are summed below
+++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ VE_out_real += real_VE_dotProductVector[i];
+++ VE_out_imag += imag_VE_dotProductVector[i];
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ VL_out_real += real_VL_dotProductVector[i];
+++ VL_out_imag += imag_VL_dotProductVector[i];
+++ }
+++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++ }
+++ // Scalar tail: process the num_points % 8 samples left after the SSE loop, adding onto the outputs
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ // Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get the very early, early, prompt, late and very late values for this sample
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Generic implementation: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+++{
+++ lv_16sc_t bb_signal_sample;
+++ lv_16sc_t tmp1;
+++ lv_16sc_t tmp2;
+++ lv_16sc_t tmp3;
+++ lv_16sc_t tmp4;
+++ lv_16sc_t tmp5;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *VE_out = 0;
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ *VL_out = 0;
+++ // The outputs start at zero; the loop below adds one Very Early, Early, Prompt, Late and Very Late product per sample
+++
+++ for(int i=0; i < num_points; ++i)
+++ {
+++ // Carrier wipe-off: multiply the input sample by the carrier sample
+++ bb_signal_sample = input[i] * carrier[i];
+++ // The code products are held in lv_16sc_t temporaries before the conversion to lv_32fc_t below
+++ tmp1 = bb_signal_sample * VE_code[i];
+++ tmp2 = bb_signal_sample * E_code[i];
+++ tmp3 = bb_signal_sample * P_code[i];
+++ tmp4 = bb_signal_sample * L_code[i];
+++ tmp5 = bb_signal_sample * VL_code[i];
+++
+++ // Convert each product to lv_32fc_t and add it to the matching output
+++ *VE_out += (lv_32fc_t)tmp1;
+++ *E_out += (lv_32fc_t)tmp2;
+++ *P_out += (lv_32fc_t)tmp3;
+++ *L_out += (lv_32fc_t)tmp4;
+++ *VL_out += (lv_32fc_t)tmp5;
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ float VE_out_real = 0;
+++ float VE_out_imag = 0;
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++ float VL_out_real = 0;
+++ float VL_out_imag = 0;
+++
+++ const lv_16sc_t* input_ptr = input;
+++ const lv_16sc_t* carrier_ptr = carrier;
+++
+++ const lv_16sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_16sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_16sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_16sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_16sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ *VE_out_ptr = 0;
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ *VL_out_ptr = 0;
+++
+++ real_VE_code_acc = _mm_setzero_ps();
+++ imag_VE_code_acc = _mm_setzero_ps();
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++ real_VL_code_acc = _mm_setzero_ps();
+++ imag_VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x1 = _mm_load_si128((__m128i*)input_ptr);
+++ input_ptr += 4;
+++ x2 = _mm_load_si128((__m128i*)input_ptr);
+++
+++ y1 = _mm_load_si128((__m128i*)carrier_ptr);
+++ carrier_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get very early values
+++ y1 = _mm_load_si128((__m128i*)VE_code_ptr);
+++ VE_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++ //Get early values
+++ y1 = _mm_load_si128((__m128i*)E_code_ptr);
+++ E_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y1 = _mm_load_si128((__m128i*)P_code_ptr);
+++ P_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y1 = _mm_load_si128((__m128i*)L_code_ptr);
+++ L_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ //Get very late values
+++ y1 = _mm_load_si128((__m128i*)VL_code_ptr);
+++ VL_code_ptr += 4;
+++ y2 = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++ input_ptr += 4;
+++ carrier_ptr += 4;
+++ VE_code_ptr += 4;
+++ E_code_ptr += 4;
+++ P_code_ptr += 4;
+++ L_code_ptr += 4;
+++ VL_code_ptr += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ VE_out_real += real_VE_dotProductVector[i];
+++ VE_out_imag += imag_VE_dotProductVector[i];
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ VL_out_real += real_VL_dotProductVector[i];
+++ VL_out_imag += imag_VL_dotProductVector[i];
+++ }
+++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version)
+++ \param input The input signal input (num_points samples)
+++ \param carrier The carrier signal input, multiplied sample-by-sample with input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (a single value; overwritten, not accumulated into)
+++ \param E_out Early correlation output (a single value; overwritten, not accumulated into)
+++ \param P_out Prompt correlation output (a single value; overwritten, not accumulated into)
+++ \param L_out Late correlation output (a single value; overwritten, not accumulated into)
+++ \param VL_out Very Late correlation output (a single value; overwritten, not accumulated into)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
+++{
+++    lv_16sc_t bb_signal_sample; // baseband sample: input after carrier wipe-off
+++    lv_16sc_t tmp1; // per-sample products with each code replica; NOTE(review): held in lv_16sc_t before widening - confirm the value range cannot overflow
+++    lv_16sc_t tmp2;
+++    lv_16sc_t tmp3;
+++    lv_16sc_t tmp4;
+++    lv_16sc_t tmp5;
+++
+++    bb_signal_sample = lv_cmake(0, 0);
+++
+++    *VE_out = 0; // every output starts from zero, so previous contents are discarded
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++    // perform Very Early, Early, Prompt, Late and Very Late correlation
+++
+++    for(unsigned int i=0; i < num_points; ++i) // index is unsigned to match num_points (no signed/unsigned comparison)
+++        {
+++            //Perform the carrier wipe-off
+++            bb_signal_sample = input[i] * carrier[i];
+++
+++            tmp1 = bb_signal_sample * VE_code[i];
+++            tmp2 = bb_signal_sample * E_code[i];
+++            tmp3 = bb_signal_sample * P_code[i];
+++            tmp4 = bb_signal_sample * L_code[i];
+++            tmp5 = bb_signal_sample * VL_code[i];
+++
+++            // Accumulate the very early, early, prompt, late and very late products into their outputs
+++            *VE_out += (lv_32fc_t)tmp1;
+++            *E_out += (lv_32fc_t)tmp2;
+++            *P_out += (lv_32fc_t)tmp3;
+++            *L_out += (lv_32fc_t)tmp4;
+++            *VL_out += (lv_32fc_t)tmp5;
+++        }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,68 @@
+++#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
+++#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++/*!
+++ \brief Accumulates the values in the input buffer (SSE path: four floats per iteration, then a scalar tail)
+++ \param result The accumulated result (a single float is written)
+++ \param inputBuffer The buffer of data to be accumulated; read with _mm_load_ps, the aligned load
+++ \param num_points The number of values in inputBuffer to be accumulated
+++*/
+++static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
+++ float returnValue = 0;
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ const float* aPtr = inputBuffer;
+++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; // scratch that receives the vector accumulator
+++
+++ __m128 accumulator = _mm_setzero_ps(); // four independent partial sums, one per lane
+++ __m128 aVal = _mm_setzero_ps();
+++
+++ for(;number < quarterPoints; number++){
+++ aVal = _mm_load_ps(aPtr);
+++ accumulator = _mm_add_ps(accumulator, aVal);
+++ aPtr += 4;
+++ }
+++ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+++ returnValue = tempBuffer[0]; // horizontal sum of the four partial sums
+++ returnValue += tempBuffer[1];
+++ returnValue += tempBuffer[2];
+++ returnValue += tempBuffer[3];
+++
+++ number = quarterPoints * 4; // first element not covered by the vector loop
+++ for(;number < num_points; number++){
+++ returnValue += (*aPtr++); // aPtr already points just past the vectorised part
+++ }
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Accumulates the values in the input buffer (portable scalar version, summed in index order)
+++ \param result The accumulated result (a single float is written; 0 when num_points is 0)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++*/
+++static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
+++ const float* aPtr = inputBuffer; // read cursor over the input
+++ unsigned int number = 0;
+++ float returnValue = 0; // running sum
+++
+++ for(;number < num_points; number++){
+++ returnValue += (*aPtr++);
+++ }
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,149 @@
+++#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
+++#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include<smmintrin.h>
+++
+++static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { // writes to target[0] the index of the largest value in src0; target is left untouched when num_points is 0
+++ if(num_points > 0){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ float* inputPtr = (float*)src0; // read with _mm_load_ps below, the aligned load
+++
+++ __m128 indexIncrementValues = _mm_set1_ps(4);
+++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // lanes hold -4,-3,-2,-1 so the first +4 yields element indices 0,1,2,3
+++
+++ float max = src0[0];
+++ float index = 0; // indices are tracked as floats throughout this function
+++ __m128 maxValues = _mm_set1_ps(max); // per-lane running maximum, seeded with src0[0]
+++ __m128 maxValuesIndex = _mm_setzero_ps(); // per-lane index of that maximum
+++ __m128 compareResults;
+++ __m128 currentValues;
+++
+++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+++
+++ for(;number < quarterPoints; number++){
+++
+++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+++
+++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); // mask set where the stored maximum is strictly greater
+++
+++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); // keep the old index where the mask is set, otherwise take the current one
+++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); // NOTE(review): on equal values a lane takes the later index, unlike the generic kernel's strict '>' - confirm callers do not depend on tie order
+++ }
+++
+++ // Reduce the four per-lane maxima (and their indices) to a single maximum
+++ _mm_store_ps(maxValuesBuffer, maxValues);
+++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+++
+++ for(number = 0; number < 4; number++){
+++ if(maxValuesBuffer[number] > max){
+++ index = maxIndexesBuffer[number];
+++ max = maxValuesBuffer[number];
+++ }
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: elements not covered by the vector loop
+++ for(;number < num_points; number++){
+++ if(src0[number] > max){
+++ index = number;
+++ max = src0[number];
+++ }
+++ }
+++ target[0] = (unsigned int)index;
+++ }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#ifdef LV_HAVE_SSE
+++#include<xmmintrin.h>
+++
+++static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { // writes to target[0] the index of the largest value in src0; target is left untouched when num_points is 0
+++ if(num_points > 0){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ float* inputPtr = (float*)src0; // read with _mm_load_ps below, the aligned load
+++
+++ __m128 indexIncrementValues = _mm_set1_ps(4);
+++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); // lanes hold -4,-3,-2,-1 so the first +4 yields element indices 0,1,2,3
+++
+++ float max = src0[0];
+++ float index = 0; // indices are tracked as floats throughout this function
+++ __m128 maxValues = _mm_set1_ps(max); // per-lane running maximum, seeded with src0[0]
+++ __m128 maxValuesIndex = _mm_setzero_ps(); // per-lane index of that maximum
+++ __m128 compareResults;
+++ __m128 currentValues;
+++
+++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+++
+++ for(;number < quarterPoints; number++){
+++
+++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+++
+++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); // mask set where the stored maximum is strictly greater
+++
+++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); // bitwise select: old index where the mask is set, current index elsewhere
+++
+++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); // same select for the values; NOTE(review): on equal values a lane takes the later index, unlike the generic kernel - confirm
+++ }
+++
+++ // Reduce the four per-lane maxima (and their indices) to a single maximum
+++ _mm_store_ps(maxValuesBuffer, maxValues);
+++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+++
+++ for(number = 0; number < 4; number++){
+++ if(maxValuesBuffer[number] > max){
+++ index = maxIndexesBuffer[number];
+++ max = maxValuesBuffer[number];
+++ }
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: elements not covered by the vector loop
+++ for(;number < num_points; number++){
+++ if(src0[number] > max){
+++ index = number;
+++ max = src0[number];
+++ }
+++ }
+++ target[0] = (unsigned int)index;
+++ }
+++}
+++
+++#endif /*LV_HAVE_SSE*/
+++
+++#ifdef LV_HAVE_GENERIC
+++static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { // writes to target[0] the index of the largest value in src0; target is left untouched when num_points is 0
+++ if(num_points > 0){
+++ float max = src0[0]; // best value seen so far, seeded with the first element
+++ unsigned int index = 0;
+++
+++ unsigned int i = 1; // element 0 is already the initial candidate
+++
+++ for(; i < num_points; ++i) {
+++
+++ if(src0[i] > max){ // strict comparison: the first occurrence of the maximum wins
+++ index = i;
+++ max = src0[i];
+++ }
+++
+++ }
+++ target[0] = index;
+++ }
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++
+++#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,302 @@
+++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
+++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 16 bit output data buffer
+++ \param scalar The value multiplied against each point in the input buffer
+++ \param num_points The number of data values to be converted
+++ \note Neither buffer needs to be properly aligned: the unaligned load/store intrinsics are used
+++ */
+++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++
+++ const unsigned int eighthPoints = num_points / 8; // number of complete groups of eight floats
+++
+++ const float* inputVectorPtr = (const float*)inputVector;
+++ int16_t* outputVectorPtr = outputVector;
+++
+++ float min_val = -32768; // clipping bounds, the int16_t range
+++ float max_val = 32767;
+++ float r;
+++
+++ __m128 vScalar = _mm_set_ps1(scalar); // scalar and clipping bounds broadcast to all four lanes
+++ __m128 inputVal1, inputVal2;
+++ __m128i intInputVal1, intInputVal2;
+++ __m128 ret1, ret2;
+++ __m128 vmin_val = _mm_set_ps1(min_val);
+++ __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++ for(;number < eighthPoints; number++){
+++ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++ // Scale and clip
+++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+++
+++ intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32 bit integer conversion
+++ intInputVal2 = _mm_cvtps_epi32(ret2);
+++
+++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // narrow the two 4x32 bit vectors into one 8x16 bit vector
+++
+++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+++ outputVectorPtr += 8;
+++ }
+++
+++ number = eighthPoints * 8; // scalar tail: elements not covered by the vector loop
+++ for(; number < num_points; number++){
+++ r = inputVector[number] * scalar;
+++ if(r > max_val)
+++ r = max_val;
+++ else if(r < min_val)
+++ r = min_val;
+++ outputVector[number] = (int16_t)rintf(r);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 16 bit output data buffer
+++ \param scalar The value multiplied against each point in the input buffer
+++ \param num_points The number of data values to be converted
+++ \note Input buffer does NOT need to be properly aligned (unaligned loads; the output is written element by element)
+++ */
+++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ const float* inputVectorPtr = (const float*)inputVector;
+++ int16_t* outputVectorPtr = outputVector;
+++
+++ float min_val = -32768; // clipping bounds, the int16_t range
+++ float max_val = 32767;
+++ float r;
+++
+++ __m128 vScalar = _mm_set_ps1(scalar); // scalar and clipping bounds broadcast to all four lanes
+++ __m128 ret;
+++ __m128 vmin_val = _mm_set_ps1(min_val);
+++ __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch: the float -> int16 conversion is done in scalar code here
+++
+++ for(;number < quarterPoints; number++){
+++ ret = _mm_loadu_ps(inputVectorPtr);
+++ inputVectorPtr += 4;
+++
+++ // Scale and clip
+++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+++
+++ _mm_store_ps(outputFloatBuffer, ret); // spill the clipped floats, then round each one with rintf
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: elements not covered by the vector loop
+++ for(; number < num_points; number++){
+++ r = inputVector[number] * scalar;
+++ if(r > max_val)
+++ r = max_val;
+++ else if(r < min_val)
+++ r = min_val;
+++ outputVector[number] = (int16_t)rintf(r);
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 16 bit output data buffer
+++ \param scalar The value multiplied against each point in the input buffer
+++ \param num_points The number of data values to be converted
+++ \note Input buffer does NOT need to be properly aligned (plain scalar accesses only)
+++ */
+++static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+++ int16_t* outputVectorPtr = outputVector;
+++ const float* inputVectorPtr = inputVector;
+++ unsigned int number = 0;
+++ float min_val = -32768; // clipping bounds, the int16_t range
+++ float max_val = 32767;
+++ float r;
+++
+++ for(number = 0; number < num_points; number++){
+++ r = *inputVectorPtr++ * scalar;
+++ if(r > max_val)
+++ r = max_val;
+++ else if(r < min_val)
+++ r = min_val;
+++ *outputVectorPtr++ = (int16_t)rintf(r); // round with rintf after clipping, then narrow
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
+++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
+++
+++#include <volk/volk_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value
+++ \param inputVector The floating point input data buffer (read with _mm_load_ps, the aligned load)
+++ \param outputVector The 16 bit output data buffer (written with _mm_store_si128, the aligned store)
+++ \param scalar The value multiplied against each point in the input buffer
+++ \param num_points The number of data values to be converted
+++ */
+++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++
+++ const unsigned int eighthPoints = num_points / 8; // number of complete groups of eight floats
+++
+++ const float* inputVectorPtr = (const float*)inputVector;
+++ int16_t* outputVectorPtr = outputVector;
+++
+++ float min_val = -32768; // clipping bounds, the int16_t range
+++ float max_val = 32767;
+++ float r;
+++
+++ __m128 vScalar = _mm_set_ps1(scalar); // scalar and clipping bounds broadcast to all four lanes
+++ __m128 inputVal1, inputVal2;
+++ __m128i intInputVal1, intInputVal2;
+++ __m128 ret1, ret2;
+++ __m128 vmin_val = _mm_set_ps1(min_val);
+++ __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++ for(;number < eighthPoints; number++){
+++ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++ // Scale and clip
+++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+++
+++ intInputVal1 = _mm_cvtps_epi32(ret1); // float -> 32 bit integer conversion
+++ intInputVal2 = _mm_cvtps_epi32(ret2);
+++
+++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // narrow the two 4x32 bit vectors into one 8x16 bit vector
+++
+++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+++ outputVectorPtr += 8;
+++ }
+++
+++ number = eighthPoints * 8; // scalar tail: elements not covered by the vector loop
+++ for(; number < num_points; number++){
+++ r = inputVector[number] * scalar;
+++ if(r > max_val)
+++ r = max_val;
+++ else if(r < min_val)
+++ r = min_val;
+++ outputVector[number] = (int16_t)rintf(r);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value
+++ \param inputVector The floating point input data buffer (read with _mm_load_ps, the aligned load)
+++ \param outputVector The 16 bit output data buffer (written element by element)
+++ \param scalar The value multiplied against each point in the input buffer
+++ \param num_points The number of data values to be converted
+++ */
+++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ const float* inputVectorPtr = (const float*)inputVector;
+++ int16_t* outputVectorPtr = outputVector;
+++
+++ float min_val = -32768; // clipping bounds, the int16_t range
+++ float max_val = 32767;
+++ float r;
+++
+++ __m128 vScalar = _mm_set_ps1(scalar); // scalar and clipping bounds broadcast to all four lanes
+++ __m128 ret;
+++ __m128 vmin_val = _mm_set_ps1(min_val);
+++ __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; // scratch: the float -> int16 conversion is done in scalar code here
+++
+++ for(;number < quarterPoints; number++){
+++ ret = _mm_load_ps(inputVectorPtr);
+++ inputVectorPtr += 4;
+++
+++ // Scale and clip
+++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+++
+++ _mm_store_ps(outputFloatBuffer, ret); // spill the clipped floats, then round each one with rintf
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: elements not covered by the vector loop
+++ for(; number < num_points; number++){
+++ r = inputVector[number] * scalar;
+++ if(r > max_val)
+++ r = max_val;
+++ else if(r < min_val)
+++ r = min_val;
+++ outputVector[number] = (int16_t)rintf(r);
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clips to [-32768, 32767], then converts the result into a 16 bit integer value
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 16 bit output data buffer
+++ \param scalar The value multiplied against each point in the input buffer
+++ \param num_points The number of data values to be converted
+++ */
+++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+++ int16_t* outputVectorPtr = outputVector;
+++ const float* inputVectorPtr = inputVector;
+++ unsigned int number = 0;
+++ float min_val = -32768; // clipping bounds, the int16_t range
+++ float max_val = 32767;
+++ float r;
+++
+++ for(number = 0; number < num_points; number++){
+++ r = *inputVectorPtr++ * scalar;
+++ if(r < min_val)
+++ r = min_val;
+++ else if(r > max_val)
+++ r = max_val;
+++ *outputVectorPtr++ = (int16_t)rintf(r); // round with rintf after clipping, then narrow
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,147 @@
+++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
+++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++/*!
+++ \brief Adds the two input vectors element by element and stores the results in the third vector (SSE path, unaligned loads and stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ float* cPtr = cVector;
+++ const float* aPtr = aVector;
+++ const float* bPtr= bVector;
+++
+++ __m128 aVal, bVal, cVal;
+++ for(;number < quarterPoints; number++){
+++
+++ aVal = _mm_loadu_ps(aPtr);
+++ bVal = _mm_loadu_ps(bPtr);
+++
+++ cVal = _mm_add_ps(aVal, bVal);
+++
+++ _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+++
+++ aPtr += 4;
+++ bPtr += 4;
+++ cPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: the pointers already sit just past the vectorised part
+++ for(;number < num_points; number++){
+++ *cPtr++ = (*aPtr++) + (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Adds the two input vectors element by element and stores the results in the third vector (portable scalar version)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+++ float* cPtr = cVector; // write cursor
+++ const float* aPtr = aVector; // read cursors
+++ const float* bPtr= bVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) + (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
+++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++/*!
+++ \brief Adds the two input vectors element by element and stores the results in the third vector (SSE path, aligned loads and stores)
+++ \param cVector The vector where the results will be stored (written with _mm_store_ps, the aligned store)
+++ \param aVector One of the vectors to be added (read with _mm_load_ps, the aligned load)
+++ \param bVector One of the vectors to be added (read with _mm_load_ps, the aligned load)
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of four floats
+++
+++ float* cPtr = cVector;
+++ const float* aPtr = aVector;
+++ const float* bPtr= bVector;
+++
+++ __m128 aVal, bVal, cVal;
+++ for(;number < quarterPoints; number++){
+++
+++ aVal = _mm_load_ps(aPtr);
+++ bVal = _mm_load_ps(bPtr);
+++
+++ cVal = _mm_add_ps(aVal, bVal);
+++
+++ _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+++
+++ aPtr += 4;
+++ bPtr += 4;
+++ cPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4; // scalar tail: the pointers already sit just past the vectorised part
+++ for(;number < num_points; number++){
+++ *cPtr++ = (*aPtr++) + (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Adds the two input vectors element by element and stores the results in the third vector (portable scalar version)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+++ float* cPtr = cVector; // write cursor
+++ const float* aPtr = aVector; // read cursors
+++ const float* bPtr= bVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) + (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Adds the two input vectors and stores their results in the third vector by forwarding to the ORC implementation
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++*/
+++extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); // defined outside this header
+++static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ // NOTE(review): named _u_orc although it sits in the _a_H section and calls _a_orc_impl - confirm the naming is intended
+++ volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,127 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Takes the conjugate of a complex vector, two complex values per iteration, using unaligned loads and stores.
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // number of complete pairs of complex values
+++
+++ __m128 x;
+++ lv_32fc_t* c = cVector;
+++ const lv_32fc_t* a = aVector;
+++
+++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // only the sign bit is set, and only in lanes 1 and 3 (the imaginary parts)
+++
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+++
+++ x = _mm_xor_ps(x, conjugator); // conjugate register: XOR with the mask flips the sign of ai and bi
+++
+++ _mm_storeu_ps((float*)c,x); // Store the results back into the C container
+++
+++ a += 2;
+++ c += 2;
+++ }
+++
+++ if((num_points % 2) != 0) { // odd count: one complex value is left over
+++ *c = lv_conj(*a);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Takes the conjugate of a complex vector, one value at a time (portable scalar version).
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+++ lv_32fc_t* cPtr = cVector;
+++ const lv_32fc_t* aPtr = aVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){ // conjugate each input value with lv_conj
+++ *cPtr++ = lv_conj(*aPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Takes the conjugate of a complex vector, two complex values per iteration, using aligned loads and stores.
+++ \param cVector The vector where the results will be stored; written with _mm_store_ps, which requires a 16-byte aligned address
+++ \param aVector Vector to be conjugated; read with _mm_load_ps, which requires a 16-byte aligned address
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // number of complete pairs of complex values
+++
+++ __m128 x;
+++ lv_32fc_t* c = cVector;
+++ const lv_32fc_t* a = aVector;
+++
+++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); // only the sign bit is set, and only in lanes 1 and 3 (the imaginary parts)
+++
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+++
+++ x = _mm_xor_ps(x, conjugator); // conjugate register: XOR with the mask flips the sign of ai and bi
+++
+++ _mm_store_ps((float*)c,x); // Store the results back into the C container
+++
+++ a += 2;
+++ c += 2;
+++ }
+++
+++ if((num_points % 2) != 0) { // odd count: one complex value is left over
+++ *c = lv_conj(*a);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Takes the conjugate of a complex vector, one value at a time (portable scalar version).
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+++ lv_32fc_t* cPtr = cVector;
+++ const lv_32fc_t* aPtr = aVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){ // conjugate each input value with lv_conj
+++ *cPtr++ = lv_conj(*aPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,295 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_convert_16ic.h
+++ * \brief Volk protokernel: converts float32 complex values to 16-bit integer complex values taking care of overflow
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 16-bit integers, saturating each component to [-32768, 32767]. Uses unaligned loads and stores.
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 16 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
+++
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int16_t* outputVectorPtr = (int16_t*)outputVector;
+++
+++    const float min_val = -32768;
+++    const float max_val = 32767;
+++
+++    __m128 inputVal1, inputVal2;
+++    __m128i intInputVal1, intInputVal2;
+++    __m128 ret1, ret2;
+++    const __m128 vmin_val = _mm_set_ps1(min_val);
+++    const __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++        // Clip to the int16 range before converting
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 int32 -> 8 int16
+++        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+++        outputVectorPtr += 8;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: remaining (num_points % 4) complex values
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int16_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 16-bit integers, saturating each component to [-32768, 32767]. Uses unaligned loads and stores.
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 16 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
+++
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int16_t* outputVectorPtr = (int16_t*)outputVector;
+++
+++    const float min_val = -32768;
+++    const float max_val = 32767;
+++
+++    __m128 inputVal1, inputVal2;
+++    __m128i intInputVal1, intInputVal2; // NOTE(review): __m128i, _mm_cvtps_epi32, _mm_packs_epi32 and _mm_storeu_si128 are SSE2 (emmintrin.h) although this kernel is guarded by LV_HAVE_SSE - confirm the guard
+++    __m128 ret1, ret2;
+++    const __m128 vmin_val = _mm_set_ps1(min_val);
+++    const __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++        // Clip to the int16 range before converting
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 int32 -> 8 int16
+++        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+++        outputVectorPtr += 8;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: remaining (num_points % 4) complex values
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int16_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 16-bit integers, saturating each component to [-32768, 32767] (portable scalar version)
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 16 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int16_t* outputVectorPtr = (int16_t*)outputVector;
+++    const float min_val = -32768;
+++    const float max_val = 32767;
+++    for(unsigned int i = 0; i < num_points*2; i++){ // two floats (real, imag) per complex value
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int16_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
+++
+++#include <volk/volk_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 16-bit integers, saturating each component to [-32768, 32767]. Uses aligned loads and stores.
+++ \param inputVector The floating point input data buffer (read only: it is never modified); read with _mm_load_ps, which requires a 16-byte aligned address
+++ \param outputVector The 16 bit output data buffer; written with _mm_store_si128, which requires a 16-byte aligned address
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
+++
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int16_t* outputVectorPtr = (int16_t*)outputVector;
+++
+++    const float min_val = -32768;
+++    const float max_val = 32767;
+++
+++    __m128 inputVal1, inputVal2;
+++    __m128i intInputVal1, intInputVal2;
+++    __m128 ret1, ret2;
+++    const __m128 vmin_val = _mm_set_ps1(min_val);
+++    const __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++        // Clip to the int16 range before converting
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 int32 -> 8 int16
+++        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+++        outputVectorPtr += 8;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: remaining (num_points % 4) complex values
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int16_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 16-bit integers, saturating each component to [-32768, 32767]. Uses aligned loads and stores.
+++ \param inputVector The floating point input data buffer (read only: it is never modified); read with _mm_load_ps, which requires a 16-byte aligned address
+++ \param outputVector The 16 bit output data buffer; written with _mm_store_si128, which requires a 16-byte aligned address
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/4; // each iteration converts 4 complex values (8 floats)
+++
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int16_t* outputVectorPtr = (int16_t*)outputVector;
+++
+++    const float min_val = -32768;
+++    const float max_val = 32767;
+++
+++    __m128 inputVal1, inputVal2;
+++    __m128i intInputVal1, intInputVal2; // NOTE(review): __m128i, _mm_cvtps_epi32, _mm_packs_epi32 and _mm_store_si128 are SSE2 (emmintrin.h) although this kernel is guarded by LV_HAVE_SSE - confirm the guard
+++    __m128 ret1, ret2;
+++    const __m128 vmin_val = _mm_set_ps1(min_val);
+++    const __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++        // Clip to the int16 range before converting
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 int32 -> 8 int16
+++        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+++        outputVectorPtr += 8;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points%4)*2; i++){ // scalar tail: remaining (num_points % 4) complex values
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int16_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 16-bit integers, saturating each component to [-32768, 32767] (portable scalar version)
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 16 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int16_t* outputVectorPtr = (int16_t*)outputVector;
+++    const float min_val = -32768;
+++    const float max_val = 32767;
+++    for(unsigned int i = 0; i < num_points*2; i++){ // two floats (real, imag) per complex value
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int16_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,213 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_convert_8ic.h
+++ * \brief Volk protokernel: converts float32 complex values to 8-bit integer complex values taking care of overflow
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 8-bit integers, saturating each component to [-128, 127]. Uses unaligned loads and stores.
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 8 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 floats)
+++
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++
+++    const float min_val = -128;
+++    const float max_val = 127;
+++
+++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+++    __m128i int8InputVal;
+++    __m128 ret1, ret2, ret3, ret4;
+++    const __m128 vmin_val = _mm_set_ps1(min_val);
+++    const __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++        // Clip to the int8 range before converting
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++        intInputVal3 = _mm_cvtps_epi32(ret3);
+++        intInputVal4 = _mm_cvtps_epi32(ret4);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 int32 -> 8 int16
+++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 2 x 8 int16 -> 16 int8
+++        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
+++        outputVectorPtr += 16;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points%8)*2; i++){ // scalar tail: the vector loop consumed 8 complex values per pass, so (num_points % 8) values, i.e. twice as many floats, remain
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 8-bit integers, saturating each component to [-128, 127] (portable scalar version)
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 8 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++    const float min_val = -128;
+++    const float max_val = 127;
+++    for(unsigned int i = 0; i < num_points*2; i++){ // two floats (real, imag) per complex value
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
+++
+++#include <volk/volk_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 8-bit integers, saturating each component to [-128, 127]. Uses aligned loads and stores.
+++ \param inputVector The floating point input data buffer (read only: it is never modified); read with _mm_load_ps, which requires a 16-byte aligned address
+++ \param outputVector The 8 bit output data buffer; written with _mm_store_si128, which requires a 16-byte aligned address
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 floats)
+++
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++
+++    const float min_val = -128;
+++    const float max_val = 127;
+++
+++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+++    __m128i int8InputVal;
+++    __m128 ret1, ret2, ret3, ret4;
+++    const __m128 vmin_val = _mm_set_ps1(min_val);
+++    const __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+++
+++        // Clip to the int8 range before converting
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++        intInputVal3 = _mm_cvtps_epi32(ret3);
+++        intInputVal4 = _mm_cvtps_epi32(ret4);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 2 x 4 int32 -> 8 int16
+++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 2 x 8 int16 -> 16 int8
+++        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
+++        outputVectorPtr += 16;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points%8)*2; i++){ // scalar tail: the vector loop consumed 8 complex values per pass, so (num_points % 8) values, i.e. twice as many floats, remain
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Converts a vector of complex floats (32 bits per component) into complex 8-bit integers, saturating each component to [-128, 127] (portable scalar version)
+++ \param inputVector The floating point input data buffer (read only: it is never modified)
+++ \param outputVector The 8 bit output data buffer
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
+++    const float* inputVectorPtr = (const float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++    const float min_val = -128;
+++    const float max_val = 127;
+++    for(unsigned int i = 0; i < num_points*2; i++){ // two floats (real, imag) per complex value
+++        float aux = inputVectorPtr[i]; // clip a local copy: the const input buffer must not be written
+++        if(aux > max_val)
+++            aux = max_val;
+++        else if(aux < min_val)
+++            aux = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(aux);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,228 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE3, unaligned loads and stores)
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of 4 complex values
+++
+++ const float* complexVectorPtr = (float*)complexVector; // the input is walked as consecutive floats, read below as (real, imag) pairs
+++ float* magnitudeVectorPtr = magnitudeVector;
+++
+++ __m128 cplxValue1, cplxValue2, result;
+++ for(;number < quarterPoints; number++){
+++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+++ complexVectorPtr += 4;
+++
+++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+++ complexVectorPtr += 4;
+++
+++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+++
+++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values: horizontal add sums each adjacent pair of lanes
+++
+++ _mm_storeu_ps(magnitudeVectorPtr, result);
+++ magnitudeVectorPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4;
+++ for(; number < num_points; number++){ // scalar tail: the remaining num_points % 4 complex values
+++ float val1Real = *complexVectorPtr++;
+++ float val1Imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++ /*!
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE, unaligned loads and stores)
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of 4 complex values
+++
+++ const float* complexVectorPtr = (float*)complexVector; // the input is walked as consecutive floats, read below as (real, imag) pairs
+++ float* magnitudeVectorPtr = magnitudeVector;
+++
+++ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+++ for(;number < quarterPoints; number++){
+++ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+++ complexVectorPtr += 4;
+++
+++ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+++ complexVectorPtr += 4;
+++
+++ // Arrange in i1i2i3i4 format (even lanes of both registers)
+++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+++ // Arrange in q1q2q3q4 format (odd lanes of both registers)
+++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+++
+++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+++
+++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+++
+++ _mm_storeu_ps(magnitudeVectorPtr, result);
+++ magnitudeVectorPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4;
+++ for(; number < num_points; number++){ // scalar tail: the remaining num_points % 4 complex values
+++ float val1Real = *complexVectorPtr++;
+++ float val1Imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (portable scalar version)
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++ const float* complexVectorPtr = (float*)complexVector; // the input is walked as consecutive floats, read below as (real, imag) pairs
+++ float* magnitudeVectorPtr = magnitudeVector;
+++ unsigned int number = 0;
+++ for(number = 0; number < num_points; number++){
+++ const float real = *complexVectorPtr++;
+++ const float imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // real^2 + imag^2, no square root is taken
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector (SSE3, aligned loads and stores)
+++ \param complexVector The vector containing the complex input values; read with _mm_load_ps, which requires a 16-byte aligned address
+++ \param magnitudeVector The vector containing the real output values; written with _mm_store_ps, which requires a 16-byte aligned address
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // number of complete groups of 4 complex values
+++
+++ const float* complexVectorPtr = (float*)complexVector; // the input is walked as consecutive floats, read below as (real, imag) pairs
+++ float* magnitudeVectorPtr = magnitudeVector;
+++
+++ __m128 cplxValue1, cplxValue2, result;
+++ for(;number < quarterPoints; number++){
+++ cplxValue1 = _mm_load_ps(complexVectorPtr);
+++ complexVectorPtr += 4;
+++
+++ cplxValue2 = _mm_load_ps(complexVectorPtr);
+++ complexVectorPtr += 4;
+++
+++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+++
+++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values: horizontal add sums each adjacent pair of lanes
+++
+++ _mm_store_ps(magnitudeVectorPtr, result);
+++ magnitudeVectorPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4;
+++ for(; number < num_points; number++){ // scalar tail: the remaining num_points % 4 complex values
+++ float val1Real = *complexVectorPtr++;
+++ float val1Imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_SSE
+++#include <xmmintrin.h>
+++ /*!
+++ \brief Calculates the magnitude squared (re*re + im*im) of each value in complexVector and stores the results in magnitudeVector (SSE, aligned loads/stores)
+++ \param complexVector The vector containing the complex input values; accessed with _mm_load_ps
+++ \param magnitudeVector The vector containing the real output values; accessed with _mm_store_ps
+++ \param num_points The number of complex values in complexVector to be processed; the same number of floats is written to magnitudeVector
+++ */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int quarterPoints = num_points / 4; // the vector loop handles 4 complex values per iteration
+++
+++ const float* complexVectorPtr = (float*)complexVector;
+++ float* magnitudeVectorPtr = magnitudeVector;
+++
+++ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+++ for(;number < quarterPoints; number++){
+++ cplxValue1 = _mm_load_ps(complexVectorPtr); // first two complex values (4 floats)
+++ complexVectorPtr += 4;
+++
+++ cplxValue2 = _mm_load_ps(complexVectorPtr); // next two complex values (4 floats)
+++ complexVectorPtr += 4;
+++
+++ // Gather the even-indexed floats (first float of each pair): i1i2i3i4
+++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+++ // Gather the odd-indexed floats (second float of each pair): q1q2q3q4
+++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+++
+++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
+++
+++ result = _mm_add_ps(iValue, qValue); // I*I + Q*Q for each of the four complex values
+++
+++ _mm_store_ps(magnitudeVectorPtr, result);
+++ magnitudeVectorPtr += 4;
+++ }
+++
+++ number = quarterPoints * 4; // scalar clean-up for the num_points % 4 remaining values
+++ for(; number < num_points; number++){
+++ float val1Real = *complexVectorPtr++;
+++ float val1Imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+++ }
+++}
+++#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Calculates the magnitude squared (re*re + im*im) of each value in complexVector and stores the results in magnitudeVector
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be processed; the same number of floats is written to magnitudeVector
+++ */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++ const float* complexVectorPtr = (float*)complexVector; // read the input as a flat float stream, two floats per complex value
+++ float* magnitudeVectorPtr = magnitudeVector;
+++ unsigned int number = 0;
+++ for(number = 0; number < num_points; number++){
+++ const float real = *complexVectorPtr++; // first float of the pair
+++ const float imag = *complexVectorPtr++; // second float of the pair
+++ *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,231 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h
+++ * \brief Volk protokernel: converts float32 complex values to 8-bit integer complex values, taking care of overflow
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Divides every float component of inputVector by scalar, clips it to [-128, 127], rounds it and stores it as an 8 bit integer (SSE2, unaligned loads/stores)
+++ \param inputVector The complex floating point input data buffer (two 32 bit floats per value)
+++ \param outputVector The complex 8 bit integer output data buffer (two 8 bit integers per value)
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++ const unsigned int sse_iters = num_points/8; // one iteration converts 8 complex values = 16 floats
+++
+++ float* inputVectorPtr = (float*)inputVector;
+++ int8_t* outputVectorPtr = (int8_t*)outputVector;
+++ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+++
+++ float min_val = -128;
+++ float max_val = 127;
+++
+++ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+++ __m128i int8InputVal;
+++ __m128 ret1, ret2, ret3, ret4;
+++ __m128 vmin_val = _mm_set_ps1(min_val);
+++ __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++ for(unsigned int i = 0;i < sse_iters; i++){
+++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++ inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++ inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++
+++ inputVal1 = _mm_mul_ps(inputVal1, invScalar);
+++ inputVal2 = _mm_mul_ps(inputVal2, invScalar);
+++ inputVal3 = _mm_mul_ps(inputVal3, invScalar);
+++ inputVal4 = _mm_mul_ps(inputVal4, invScalar);
+++ // Clip to the int8 range before converting
+++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+++
+++ intInputVal1 = _mm_cvtps_epi32(ret1);
+++ intInputVal2 = _mm_cvtps_epi32(ret2);
+++ intInputVal3 = _mm_cvtps_epi32(ret3);
+++ intInputVal4 = _mm_cvtps_epi32(ret4);
+++
+++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 bit -> 16 bit
+++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); // 32 bit -> 16 bit
+++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 bit -> 8 bit
+++
+++ _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
+++ outputVectorPtr += 16;
+++ }
+++
+++ float scaled = 0;
+++ for(unsigned int i = 0; i < (num_points%8)*2; i++){ // remaining num_points%8 complex values, two floats each
+++ scaled = inputVectorPtr[i]/scalar;
+++ if(scaled > max_val)
+++ scaled = max_val;
+++ else if(scaled < min_val)
+++ scaled = min_val;
+++ outputVectorPtr[i] = (int8_t)rintf(scaled);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Divides every float component of inputVector by scalar, clips it to [-128, 127], rounds it with rintf and stores it as an 8 bit integer
+++ \param inputVector The complex floating point input data buffer (two 32 bit floats per value)
+++ \param outputVector The complex 8 bit integer output data buffer (two 8 bit integers per value)
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++ float* inputVectorPtr = (float*)inputVector;
+++ int8_t* outputVectorPtr = (int8_t*)outputVector;
+++ float scaled = 0;
+++ float min_val = -128;
+++ float max_val = 127;
+++
+++ for(unsigned int i = 0; i < num_points*2; i++){ // two float components per complex value
+++ scaled = (inputVectorPtr[i])/scalar;
+++ if(scaled > max_val)
+++ scaled = max_val;
+++ else if(scaled < min_val)
+++ scaled = min_val;
+++ outputVectorPtr[i] = (int8_t)rintf(scaled);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
+++
+++#include <volk/volk_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Divides every float component of inputVector by scalar, clips it to [-128, 127], rounds it and stores it as an 8 bit integer (SSE2, aligned loads/stores)
+++ \param inputVector The complex floating point input data buffer (two 32 bit floats per value); accessed with _mm_load_ps
+++ \param outputVector The complex 8 bit integer output data buffer (two 8 bit integers per value); accessed with _mm_store_si128
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++ const unsigned int sse_iters = num_points/8; // one iteration converts 8 complex values = 16 floats
+++
+++ float* inputVectorPtr = (float*)inputVector;
+++ int8_t* outputVectorPtr = (int8_t*)outputVector;
+++ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+++
+++ float min_val = -128;
+++ float max_val = 127;
+++
+++ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+++ __m128i int8InputVal;
+++ __m128 ret1, ret2, ret3, ret4;
+++ __m128 vmin_val = _mm_set_ps1(min_val);
+++ __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++ for(unsigned int i = 0;i < sse_iters; i++){
+++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++ inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++ inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++
+++ inputVal1 = _mm_mul_ps(inputVal1, invScalar);
+++ inputVal2 = _mm_mul_ps(inputVal2, invScalar);
+++ inputVal3 = _mm_mul_ps(inputVal3, invScalar);
+++ inputVal4 = _mm_mul_ps(inputVal4, invScalar);
+++ // Clip to the int8 range before converting
+++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+++
+++ intInputVal1 = _mm_cvtps_epi32(ret1);
+++ intInputVal2 = _mm_cvtps_epi32(ret2);
+++ intInputVal3 = _mm_cvtps_epi32(ret3);
+++ intInputVal4 = _mm_cvtps_epi32(ret4);
+++
+++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // 32 bit -> 16 bit
+++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); // 32 bit -> 16 bit
+++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); // 16 bit -> 8 bit
+++
+++ _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
+++ outputVectorPtr += 16;
+++ }
+++
+++ float scaled = 0;
+++ for(unsigned int i = 0; i < (num_points%8)*2; i++){ // remaining num_points%8 complex values, two floats each
+++ scaled = inputVectorPtr[i]/scalar;
+++ if(scaled > max_val)
+++ scaled = max_val;
+++ else if(scaled < min_val)
+++ scaled = min_val;
+++ outputVectorPtr[i] = (int8_t)rintf(scaled);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Divides every float component of inputVector by scalar, clips it to [-128, 127], rounds it with rintf and stores it as an 8 bit integer
+++ \param inputVector The complex floating point input data buffer (two 32 bit floats per value)
+++ \param outputVector The complex 8 bit integer output data buffer (two 8 bit integers per value)
+++ \param num_points The number of complex data values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++ float* inputVectorPtr = (float*)inputVector;
+++ int8_t* outputVectorPtr = (int8_t*)outputVector;
+++ float scaled = 0;
+++ float min_val = -128;
+++ float max_val = 127;
+++
+++ for(unsigned int i = 0; i < num_points*2; i++){ // two float components per complex value
+++ scaled = inputVectorPtr[i]/scalar;
+++ if(scaled > max_val)
+++ scaled = max_val;
+++ else if(scaled < min_val)
+++ scaled = min_val;
+++ outputVectorPtr[i] = (int8_t)rintf(scaled);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,266 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
+++ * \brief Volk protokernel: replaces the tracking function for update_local_code
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that replaces the tracking function for update_local_code
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include <smmintrin.h>
+++ /*!
+++ \brief Fills d_very_early_code with d_ca_code[2 + round(fmod(t - 2*d_very_early_late_spc_chips, code_length_half_chips))], where t starts at tcode_half_chips_input and grows by code_phase_step_half_chips per output sample
+++ \param d_very_early_code The vector where the selected code samples will be stored
+++ \param d_ca_code The code vector that is read at the computed chip indices
+++ \param num_points The number of samples to be written into d_very_early_code
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++// float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++// *pointer1 = 1;
+++// float* pointer2 = (float*)&code_length_half_chips;
+++// *pointer2 = 6;
+++// float* pointer3 = (float*)&code_phase_step_half_chips;
+++// *pointer3 = 7;
+++// float* pointer4 = (float*)&tcode_half_chips_input;
+++// *pointer4 = 8;
+++
+++ const unsigned int sse_iters = num_points / 4; // four output samples per vector iteration
+++
+++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
+++
+++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
+++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
+++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
+++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
+++ __m128 twos = _mm_set1_ps (2);
+++ __m128i associated_chip_index_array_int;
+++
+++ __VOLK_ATTR_ALIGNED(16) int32_t output[4];
+++
+++ for (unsigned int i = 0; i < sse_iters; i++)
+++ {
+++ //fmod = numer - tquot * denom; tquot = numer/denom truncated
+++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
+++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
+++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
+++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
+++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
+++
+++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
+++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
+++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
+++ _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);
+++
+++ //d_very_early_code[i] = d_ca_code[associated_chip_index];
+++ *d_very_early_code++ = d_ca_code[output[0]];
+++ *d_very_early_code++ = d_ca_code[output[1]];
+++ *d_very_early_code++ = d_ca_code[output[2]];
+++ *d_very_early_code++ = d_ca_code[output[3]];
+++
+++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
+++ }
+++
+++ if (num_points%4!=0)
+++ {
+++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
+++ _mm_storeu_ps (tcode_half_chips_stored, tcode_half_chips_array); // float store: the source register is __m128, not __m128i
+++
+++ int associated_chip_index;
+++ float tcode_half_chips = tcode_half_chips_stored[0]; // lane 0 holds the phase of the first sample not yet written
+++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++ for (unsigned int i = 0; i < num_points%4; i++)
+++ {
+++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++ d_very_early_code[i] = d_ca_code[associated_chip_index]; // d_very_early_code was already advanced past the vectorised samples
+++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++ }
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Fills d_very_early_code with d_ca_code[2 + round(fmod(t - 2*d_very_early_late_spc_chips, code_length_half_chips))], where t starts at tcode_half_chips_input and grows by code_phase_step_half_chips per output sample
+++ \param d_very_early_code The vector where the selected code samples will be stored
+++ \param d_ca_code The code vector that is read at the computed chip indices
+++ \param num_points The number of samples to be written into d_very_early_code
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++ // The four const float parameters are used exactly as passed in.
+++ // Debug code that used to live here cast away their constness and
+++ // overwrote them with the fixed values 1, 6, 7 and 8. Writing to a
+++ // const object is undefined behaviour, and it made this kernel ignore
+++ // its inputs, so those writes have been removed.
+++ //
+++ // The phase accumulator below starts at tcode_half_chips_input and is
+++ // advanced by code_phase_step_half_chips after every output sample.
+++
+++ int associated_chip_index;
+++ float tcode_half_chips = tcode_half_chips_input;
+++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++ for (unsigned int i = 0; i < num_points; i++)
+++ {
+++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++ d_very_early_code[i] = d_ca_code[associated_chip_index];
+++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include <smmintrin.h>
+++ /*!
+++ \brief Fills d_very_early_code with d_ca_code[2 + round(fmod(t - 2*d_very_early_late_spc_chips, code_length_half_chips))], where t starts at tcode_half_chips_input and grows by code_phase_step_half_chips per output sample
+++ \param d_very_early_code The vector where the selected code samples will be stored
+++ \param d_ca_code The code vector that is read at the computed chip indices
+++ \param num_points The number of samples to be written into d_very_early_code
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++ // float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++ // *pointer1 = 1;
+++ // float* pointer2 = (float*)&code_length_half_chips;
+++ // *pointer2 = 6;
+++ // float* pointer3 = (float*)&code_phase_step_half_chips;
+++ // *pointer3 = 7;
+++ // float* pointer4 = (float*)&tcode_half_chips_input;
+++ // *pointer4 = 8;
+++
+++ const unsigned int sse_iters = num_points / 4; // four output samples per vector iteration
+++
+++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
+++
+++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
+++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
+++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
+++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
+++ __m128 twos = _mm_set1_ps (2);
+++ __m128i associated_chip_index_array_int;
+++
+++ __VOLK_ATTR_ALIGNED(16) int32_t output[4];
+++
+++ for (unsigned int i = 0; i < sse_iters; i++)
+++ {
+++ //fmod = numer - tquot * denom; tquot = numer/denom truncated
+++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
+++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
+++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
+++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
+++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
+++
+++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
+++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
+++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
+++ _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);
+++
+++ //d_very_early_code[i] = d_ca_code[associated_chip_index];
+++ *d_very_early_code++ = d_ca_code[output[0]];
+++ *d_very_early_code++ = d_ca_code[output[1]];
+++ *d_very_early_code++ = d_ca_code[output[2]];
+++ *d_very_early_code++ = d_ca_code[output[3]];
+++
+++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
+++ }
+++
+++ if (num_points%4!=0)
+++ {
+++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
+++ _mm_store_ps (tcode_half_chips_stored, tcode_half_chips_array); // float store: the source register is __m128, not __m128i
+++
+++ int associated_chip_index;
+++ float tcode_half_chips = tcode_half_chips_stored[0]; // lane 0 holds the phase of the first sample not yet written
+++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++ for (unsigned int i = 0; i < num_points%4; i++)
+++ {
+++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++ d_very_early_code[i] = d_ca_code[associated_chip_index]; // d_very_early_code was already advanced past the vectorised samples
+++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++ }
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Fills d_very_early_code with d_ca_code[2 + round(fmod(t - 2*d_very_early_late_spc_chips, code_length_half_chips))], where t starts at tcode_half_chips_input and grows by code_phase_step_half_chips per output sample
+++ \param d_very_early_code The vector where the selected code samples will be stored
+++ \param d_ca_code The code vector that is read at the computed chip indices
+++ \param num_points The number of samples to be written into d_very_early_code
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++ // Disabled debug code: it would overwrite the const parameters with fixed values.
+++ // float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++ // *pointer1 = 1; float* pointer2 = (float*)&code_length_half_chips;
+++ // *pointer2 = 6;
+++ // float* pointer3 = (float*)&code_phase_step_half_chips;
+++ // *pointer3 = 7;
+++ // float* pointer4 = (float*)&tcode_half_chips_input;
+++ // *pointer4 = 8;
+++
+++ int associated_chip_index;
+++ float tcode_half_chips = tcode_half_chips_input; // running code phase, advanced once per output sample
+++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++ for (unsigned int i = 0; i < num_points; i++)
+++ {
+++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++ d_very_early_code[i] = d_ca_code[associated_chip_index];
+++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,178 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Multiplies every value of the input vector by a complex scalar and stores the results in cVector (SSE3, unaligned loads/stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // the vector loop handles 2 complex values per iteration
+++
+++ __m128 x, yl, yh, z, tmp1, tmp2;
+++ lv_32fc_t* c = cVector;
+++ const lv_32fc_t* a = aVector;
+++
+++ // Broadcast the scalar: yl holds its real part (sr) in every lane, yh its imaginary part (si)
+++ yl = _mm_set_ps1(lv_creal(scalar));
+++ yh = _mm_set_ps1(lv_cimag(scalar));
+++
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_loadu_ps((float*)a); // Load two complex values a and b as ar,ai,br,bi
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+++
+++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+++
+++ a += 2;
+++ c += 2;
+++ }
+++
+++ if((num_points % 2) != 0) { // odd count: one value is left for the scalar path
+++ *c = (*a) * scalar;
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies every value of the input vector by a complex scalar and stores the results in cVector
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++ lv_32fc_t* cPtr = cVector;
+++ const lv_32fc_t* aPtr = aVector;
+++ unsigned int number = num_points; // counts the values still to be processed
+++
+++ // manually unrolled loop: eight values per pass
+++ while (number >= 8){
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ number -= 8;
+++ }
+++
+++ // clean up the remaining (fewer than eight) values
+++ while (number-- > 0)
+++ *cPtr++ = *aPtr++ * scalar;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Multiplies every value of the input vector by a complex scalar and stores the results in cVector (SSE3, aligned loads/stores)
+++ \param cVector The vector where the results will be stored; accessed with _mm_store_ps
+++ \param aVector The vector to be multiplied; accessed with _mm_load_ps
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // the vector loop handles 2 complex values per iteration
+++
+++ __m128 x, yl, yh, z, tmp1, tmp2;
+++ lv_32fc_t* c = cVector;
+++ const lv_32fc_t* a = aVector;
+++
+++ // Broadcast the scalar: yl holds its real part (sr) in every lane, yh its imaginary part (si)
+++ yl = _mm_set_ps1(lv_creal(scalar));
+++ yh = _mm_set_ps1(lv_cimag(scalar));
+++
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_load_ps((float*)a); // Load two complex values a and b as ar,ai,br,bi
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+++
+++ _mm_store_ps((float*)c,z); // Store the results back into the C container
+++
+++ a += 2;
+++ c += 2;
+++ }
+++
+++ if((num_points % 2) != 0) { // odd count: one value is left for the scalar path
+++ *c = (*a) * scalar;
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Multiplies every element of the input complex vector by a complex scalar and stores the products in the output vector
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied by the scalar
+++ \param scalar The complex scalar that multiplies each element of aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++ lv_32fc_t* cPtr = cVector;
+++ const lv_32fc_t* aPtr = aVector;
+++ unsigned int number = num_points;
+++
+++ // unrolled loop: eight products per pass
+++ while (number >= 8){
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ number -= 8;
+++ }
+++
+++ // clean up the remaining (fewer than eight) points one at a time
+++ while (number-- > 0)
+++ *cPtr++ = *aPtr++ * scalar;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,763 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <stdio.h>
+++#include <string.h>
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++/*! \brief Complex dot product, *result = sum over i of input[i]*taps[i] (taps are not conjugated); generic C version. */
+++/* result receives the sum; input and taps are each indexed up to num_points - 1. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ float * res = (float*) result;
+++ float * in = (float*) input;
+++ float * tp = (float*) taps;
+++ unsigned int n_2_ccomplex_blocks = num_points/2;
+++ unsigned int isodd = num_points & 1;
+++
+++ float sum0[2] = {0,0};
+++ float sum1[2] = {0,0};
+++ unsigned int i = 0;
+++ // Each pass consumes two complex points; sum0/sum1 hold {real, imag} partial sums
+++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+++
+++ in += 4;
+++ tp += 4;
+++ }
+++
+++ res[0] = sum0[0] + sum1[0];
+++ res[1] = sum0[1] + sum1[1];
+++
+++ // Cleanup if we had an odd number of points (isodd is 0 or 1, so this runs at most once)
+++ for(i = 0; i < isodd; ++i) {
+++ *result += input[num_points - 1] * taps[num_points - 1];
+++ }
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++
+++
+++#if LV_HAVE_SSE && LV_HAVE_64
+++/*! \brief Complex dot product, *result = sum of input[i]*taps[i] (taps not conjugated); x86-64 SSE inline assembly using unaligned loads (movups). */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++ /* 64-bit byte count: the asm reads the whole of rcx, and num_points*8 cannot wrap at this width. */
+++ const unsigned long long num_bytes = (unsigned long long)num_points*8;
+++ unsigned int isodd = num_points & 1;
+++ /* The clobber list names every register the template writes (rax, r8-r10, xmm0-xmm7); "memory" is listed because the asm reads input/taps and stores to *result through pointers. */
+++ asm
+++ (
+++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+++ "# const float *taps, unsigned num_bytes)\n\t"
+++ "# float sum0 = 0;\n\t"
+++ "# float sum1 = 0;\n\t"
+++ "# float sum2 = 0;\n\t"
+++ "# float sum3 = 0;\n\t"
+++ "# do {\n\t"
+++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+++ "# input += 4;\n\t"
+++ "# taps += 4; \n\t"
+++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+++ "# result[0] = sum0 + sum2;\n\t"
+++ "# result[1] = sum1 + sum3;\n\t"
+++ "# TODO: prefetch and better scheduling\n\t"
+++ " xor %%r9, %%r9\n\t"
+++ " xor %%r10, %%r10\n\t"
+++ " movq %%rcx, %%rax\n\t"
+++ " movq %%rcx, %%r8\n\t"
+++ " movq %[rsi], %%r9\n\t"
+++ " movq %[rdx], %%r10\n\t"
+++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+++ " movups 0(%%r9), %%xmm0\n\t"
+++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+++ " movups 0(%%r10), %%xmm2\n\t"
+++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+++ " shr $4, %%r8\n\t"
+++ " jmp .%=L1_test\n\t"
+++ " # 4 taps / loop\n\t"
+++ " # something like ?? cycles / loop\n\t"
+++ ".%=Loop1: \n\t"
+++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+++ "# movups (%%r9), %%xmmA\n\t"
+++ "# movups (%%r10), %%xmmB\n\t"
+++ "# movups %%xmmA, %%xmmZ\n\t"
+++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+++ "# mulps %%xmmB, %%xmmA\n\t"
+++ "# mulps %%xmmZ, %%xmmB\n\t"
+++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+++ "# xorps %%xmmPN, %%xmmA\n\t"
+++ "# movups %%xmmA, %%xmmZ\n\t"
+++ "# unpcklps %%xmmB, %%xmmA\n\t"
+++ "# unpckhps %%xmmB, %%xmmZ\n\t"
+++ "# movups %%xmmZ, %%xmmY\n\t"
+++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+++ "# addps %%xmmZ, %%xmmA\n\t"
+++ "# addps %%xmmA, %%xmmC\n\t"
+++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+++ " movups 16(%%r9), %%xmm1\n\t"
+++ " movups %%xmm0, %%xmm4\n\t"
+++ " mulps %%xmm2, %%xmm0\n\t"
+++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+++ " movups 16(%%r10), %%xmm3\n\t"
+++ " movups %%xmm1, %%xmm5\n\t"
+++ " addps %%xmm0, %%xmm6\n\t"
+++ " mulps %%xmm3, %%xmm1\n\t"
+++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+++ " addps %%xmm1, %%xmm6\n\t"
+++ " mulps %%xmm4, %%xmm2\n\t"
+++ " movups 32(%%r9), %%xmm0\n\t"
+++ " addps %%xmm2, %%xmm7\n\t"
+++ " mulps %%xmm5, %%xmm3\n\t"
+++ " add $32, %%r9\n\t"
+++ " movups 32(%%r10), %%xmm2\n\t"
+++ " addps %%xmm3, %%xmm7\n\t"
+++ " add $32, %%r10\n\t"
+++ ".%=L1_test:\n\t"
+++ " dec %%rax\n\t"
+++ " jge .%=Loop1\n\t"
+++ " # We've handled the bulk of multiplies up to here.\n\t"
+++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+++ " # If so, we've got 2 more taps to do.\n\t"
+++ " and $1, %%r8\n\t"
+++ " je .%=Leven\n\t"
+++ " # The count was odd, do 2 more taps.\n\t"
+++ " # Note that we've already got mm0/mm2 preloaded\n\t"
+++ " # from the main loop.\n\t"
+++ " movups %%xmm0, %%xmm4\n\t"
+++ " mulps %%xmm2, %%xmm0\n\t"
+++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+++ " addps %%xmm0, %%xmm6\n\t"
+++ " mulps %%xmm4, %%xmm2\n\t"
+++ " addps %%xmm2, %%xmm7\n\t"
+++ ".%=Leven:\n\t"
+++ " # neg inversor\n\t"
+++ " xorps %%xmm1, %%xmm1\n\t"
+++ " mov $0x80000000, %%r9\n\t"
+++ " movd %%r9, %%xmm1\n\t"
+++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+++ " # pfpnacc\n\t"
+++ " xorps %%xmm1, %%xmm6\n\t"
+++ " movups %%xmm6, %%xmm2\n\t"
+++ " unpcklps %%xmm7, %%xmm6\n\t"
+++ " unpckhps %%xmm7, %%xmm2\n\t"
+++ " movups %%xmm2, %%xmm3\n\t"
+++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+++ " addps %%xmm2, %%xmm6\n\t"
+++ " # xmm6 = r1 i2 r3 i4\n\t"
+++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+++ :
+++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+++ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
+++ );
+++ /* NOTE(review): the template loads 16 bytes from input and taps before testing the count and preloads the next 16 bytes at the end of every loop pass, so it can read past num_points elements. */
+++
+++ if(isodd) {
+++ *result += input[num_points - 1] * taps[num_points - 1];
+++ }
+++
+++ return;
+++
+++}
+++
+++#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+++
+++
+++
+++
+++#ifdef LV_HAVE_SSE3
+++
+++#include <pmmintrin.h>
+++/*! \brief Complex dot product, *result = sum of input[i]*taps[i] (taps not conjugated); SSE3 version using unaligned loads, two complex points per loop pass. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ lv_32fc_t dotProduct;
+++ memset(&dotProduct, 0x0, 2*sizeof(float));
+++
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points/2;
+++ unsigned int isodd = num_points & 1;
+++
+++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+++
+++ const lv_32fc_t* a = input;
+++ const lv_32fc_t* b = taps;
+++
+++ dotProdVal = _mm_setzero_ps();
+++
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+++
+++ a += 2;
+++ b += 2;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+++
+++ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+++ // The accumulator holds two complex partial sums; add them to get the total
+++ dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+++ // An odd final point is not covered by the two-at-a-time loop
+++ if(isodd) {
+++ dotProduct += input[num_points - 1] * taps[num_points - 1];
+++ }
+++
+++ *result = dotProduct;
+++}
+++
+++#endif /*LV_HAVE_SSE3*/
+++
+++#ifdef LV_HAVE_SSE4_1
+++
+++#include <smmintrin.h>
+++/*! \brief Complex dot product, *result = sum of input[i]*taps[i] (taps not conjugated); SSE4.1 version using unaligned loads, four complex points per loop pass. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ unsigned int i = 0;
+++ const unsigned int qtr_points = num_points/4;
+++ const unsigned int isodd = num_points & 3; // despite the name: number of leftover points (num_points mod 4)
+++
+++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+++ float *p_input, *p_taps;
+++ __m64 *p_result;
+++
+++ p_result = (__m64*)result;
+++ p_input = (float*)input;
+++ p_taps = (float*)taps;
+++ // NOTE(review): neg looks intended as a sign-bit mask for float lane 0 only; this relies on how the compiler initializes __m128i from one integer - confirm
+++ static const __m128i neg = {0x000000000000000080000000};
+++
+++ real0 = _mm_setzero_ps();
+++ real1 = _mm_setzero_ps();
+++ im0 = _mm_setzero_ps();
+++ im1 = _mm_setzero_ps();
+++
+++ for(; i < qtr_points; ++i) {
+++ xmm0 = _mm_loadu_ps(p_input);
+++ xmm1 = _mm_loadu_ps(p_taps);
+++
+++ p_input += 4;
+++ p_taps += 4;
+++
+++ xmm2 = _mm_loadu_ps(p_input);
+++ xmm3 = _mm_loadu_ps(p_taps);
+++
+++ p_input += 4;
+++ p_taps += 4;
+++ // De-interleave the loaded complex values into separate real and imaginary vectors
+++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+++
+++ //imaginary vector from input
+++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+++ //real vector from input
+++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+++ //imaginary vector from taps
+++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+++ //real vector from taps
+++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+++ // _mm_dp_ps: mask 0xf1 puts the sum of all four products in lane 0, mask 0xf2 puts it in lane 1
+++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); // sum(in_re * tap_re)
+++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); // sum(in_im * tap_im)
+++
+++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); // sum(in_re * tap_im)
+++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); // sum(in_im * tap_re)
+++
+++ real0 = _mm_add_ps(xmm4, real0);
+++ real1 = _mm_add_ps(xmm5, real1);
+++ im0 = _mm_add_ps(xmm6, im0);
+++ im1 = _mm_add_ps(xmm7, im1);
+++ }
+++ // XOR with the neg mask flips the sign of the im*im sum, so real0 + real1 below is sum(re*re) - sum(im*im)
+++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+++
+++ im0 = _mm_add_ps(im0, im1);
+++ real0 = _mm_add_ps(real0, real1);
+++ // Merge both halves: lane 0 carries the real part, lane 1 the imaginary part
+++ im0 = _mm_add_ps(im0, real0);
+++ // Store the low two floats (one complex value) to *result
+++ _mm_storel_pi(p_result, im0);
+++ // Add the leftover points (num_points mod 4) one at a time
+++ for(i = num_points-isodd; i < num_points; i++) {
+++ *result += input[i] * taps[i];
+++ }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++
+++
+++
+++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
+++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <stdio.h>
+++#include <string.h>
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++/*! \brief Complex dot product, *result = sum over i of input[i]*taps[i] (taps are not conjugated); generic C version. */
+++/* result receives the sum; input and taps are each indexed up to num_points - 1. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ /* The pair count is taken from num_points directly: a num_points*8 byte count wraps in unsigned int once num_points reaches 2^29. */
+++
+++ float * res = (float*) result;
+++ float * in = (float*) input;
+++ float * tp = (float*) taps;
+++ unsigned int n_2_ccomplex_blocks = num_points / 2;
+++ unsigned int isodd = num_points & 1;
+++
+++ float sum0[2] = {0,0};
+++ float sum1[2] = {0,0};
+++ unsigned int i = 0;
+++ /* Each pass consumes two complex points; sum0/sum1 hold {real, imag} partial sums. */
+++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+++
+++ in += 4;
+++ tp += 4;
+++ }
+++
+++ res[0] = sum0[0] + sum1[0];
+++ res[1] = sum0[1] + sum1[1];
+++ /* Add the final point when num_points is odd (isodd is 0 or 1, so this runs at most once). */
+++ for(i = 0; i < isodd; ++i) {
+++ *result += input[num_points - 1] * taps[num_points - 1];
+++ }
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++
+++#if LV_HAVE_SSE && LV_HAVE_64
+++
+++/*! \brief Complex dot product, *result = sum of input[i]*taps[i] (taps not conjugated); x86-64 SSE inline assembly using aligned loads (movaps), so input and taps must be 16-byte aligned. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++ /* 64-bit byte count: the asm reads the whole of rcx, and num_points*8 cannot wrap at this width. */
+++ const unsigned long long num_bytes = (unsigned long long)num_points*8;
+++ unsigned int isodd = num_points & 1;
+++ /* The clobber list names every register the template writes (rax, r8-r10, xmm0-xmm7); "memory" is listed because the asm reads input/taps and stores to *result through pointers. */
+++ asm
+++ (
+++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+++ "# const float *taps, unsigned num_bytes)\n\t"
+++ "# float sum0 = 0;\n\t"
+++ "# float sum1 = 0;\n\t"
+++ "# float sum2 = 0;\n\t"
+++ "# float sum3 = 0;\n\t"
+++ "# do {\n\t"
+++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+++ "# input += 4;\n\t"
+++ "# taps += 4; \n\t"
+++ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+++ "# result[0] = sum0 + sum2;\n\t"
+++ "# result[1] = sum1 + sum3;\n\t"
+++ "# TODO: prefetch and better scheduling\n\t"
+++ " xor %%r9, %%r9\n\t"
+++ " xor %%r10, %%r10\n\t"
+++ " movq %%rcx, %%rax\n\t"
+++ " movq %%rcx, %%r8\n\t"
+++ " movq %[rsi], %%r9\n\t"
+++ " movq %[rdx], %%r10\n\t"
+++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+++ " movaps 0(%%r9), %%xmm0\n\t"
+++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+++ " movaps 0(%%r10), %%xmm2\n\t"
+++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+++ " shr $4, %%r8\n\t"
+++ " jmp .%=L1_test\n\t"
+++ " # 4 taps / loop\n\t"
+++ " # something like ?? cycles / loop\n\t"
+++ ".%=Loop1: \n\t"
+++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+++ "# movaps (%%r9), %%xmmA\n\t"
+++ "# movaps (%%r10), %%xmmB\n\t"
+++ "# movaps %%xmmA, %%xmmZ\n\t"
+++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+++ "# mulps %%xmmB, %%xmmA\n\t"
+++ "# mulps %%xmmZ, %%xmmB\n\t"
+++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+++ "# xorps %%xmmPN, %%xmmA\n\t"
+++ "# movaps %%xmmA, %%xmmZ\n\t"
+++ "# unpcklps %%xmmB, %%xmmA\n\t"
+++ "# unpckhps %%xmmB, %%xmmZ\n\t"
+++ "# movaps %%xmmZ, %%xmmY\n\t"
+++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+++ "# addps %%xmmZ, %%xmmA\n\t"
+++ "# addps %%xmmA, %%xmmC\n\t"
+++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+++ " movaps 16(%%r9), %%xmm1\n\t"
+++ " movaps %%xmm0, %%xmm4\n\t"
+++ " mulps %%xmm2, %%xmm0\n\t"
+++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+++ " movaps 16(%%r10), %%xmm3\n\t"
+++ " movaps %%xmm1, %%xmm5\n\t"
+++ " addps %%xmm0, %%xmm6\n\t"
+++ " mulps %%xmm3, %%xmm1\n\t"
+++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+++ " addps %%xmm1, %%xmm6\n\t"
+++ " mulps %%xmm4, %%xmm2\n\t"
+++ " movaps 32(%%r9), %%xmm0\n\t"
+++ " addps %%xmm2, %%xmm7\n\t"
+++ " mulps %%xmm5, %%xmm3\n\t"
+++ " add $32, %%r9\n\t"
+++ " movaps 32(%%r10), %%xmm2\n\t"
+++ " addps %%xmm3, %%xmm7\n\t"
+++ " add $32, %%r10\n\t"
+++ ".%=L1_test:\n\t"
+++ " dec %%rax\n\t"
+++ " jge .%=Loop1\n\t"
+++ " # We've handled the bulk of multiplies up to here.\n\t"
+++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+++ " # If so, we've got 2 more taps to do.\n\t"
+++ " and $1, %%r8\n\t"
+++ " je .%=Leven\n\t"
+++ " # The count was odd, do 2 more taps.\n\t"
+++ " # Note that we've already got mm0/mm2 preloaded\n\t"
+++ " # from the main loop.\n\t"
+++ " movaps %%xmm0, %%xmm4\n\t"
+++ " mulps %%xmm2, %%xmm0\n\t"
+++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+++ " addps %%xmm0, %%xmm6\n\t"
+++ " mulps %%xmm4, %%xmm2\n\t"
+++ " addps %%xmm2, %%xmm7\n\t"
+++ ".%=Leven:\n\t"
+++ " # neg inversor\n\t"
+++ " xorps %%xmm1, %%xmm1\n\t"
+++ " mov $0x80000000, %%r9\n\t"
+++ " movd %%r9, %%xmm1\n\t"
+++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+++ " # pfpnacc\n\t"
+++ " xorps %%xmm1, %%xmm6\n\t"
+++ " movaps %%xmm6, %%xmm2\n\t"
+++ " unpcklps %%xmm7, %%xmm6\n\t"
+++ " unpckhps %%xmm7, %%xmm2\n\t"
+++ " movaps %%xmm2, %%xmm3\n\t"
+++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+++ " addps %%xmm2, %%xmm6\n\t"
+++ " # xmm6 = r1 i2 r3 i4\n\t"
+++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
+++ :
+++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+++ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
+++ );
+++ /* NOTE(review): the template loads 16 bytes from input and taps before testing the count and preloads the next 16 bytes at the end of every loop pass, so it can read past num_points elements. */
+++
+++ if(isodd) {
+++ *result += input[num_points - 1] * taps[num_points - 1];
+++ }
+++
+++ return;
+++
+++}
+++
+++#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+++
+++#if LV_HAVE_SSE && LV_HAVE_32
+++/*! \brief 32-bit SSE entry point for the complex dot product; it forwards to the generic kernel, and the inline assembly kept below is compiled out by #if 0. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+++
+++#if 0
+++ const unsigned int num_bytes = num_points*8;
+++ unsigned int isodd = num_points & 1;
+++
+++ asm volatile
+++ (
+++ " #pushl %%ebp\n\t"
+++ " #movl %%esp, %%ebp\n\t"
+++ " movl 12(%%ebp), %%eax # input\n\t"
+++ " movl 16(%%ebp), %%edx # taps\n\t"
+++ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
+++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+++ " movaps 0(%%eax), %%xmm0\n\t"
+++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+++ " movaps 0(%%edx), %%xmm2\n\t"
+++ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
+++ " jmp .%=L1_test\n\t"
+++ " # 4 taps / loop\n\t"
+++ " # something like ?? cycles / loop\n\t"
+++ ".%=Loop1: \n\t"
+++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+++ "# movaps (%%eax), %%xmmA\n\t"
+++ "# movaps (%%edx), %%xmmB\n\t"
+++ "# movaps %%xmmA, %%xmmZ\n\t"
+++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+++ "# mulps %%xmmB, %%xmmA\n\t"
+++ "# mulps %%xmmZ, %%xmmB\n\t"
+++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+++ "# xorps %%xmmPN, %%xmmA\n\t"
+++ "# movaps %%xmmA, %%xmmZ\n\t"
+++ "# unpcklps %%xmmB, %%xmmA\n\t"
+++ "# unpckhps %%xmmB, %%xmmZ\n\t"
+++ "# movaps %%xmmZ, %%xmmY\n\t"
+++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+++ "# addps %%xmmZ, %%xmmA\n\t"
+++ "# addps %%xmmA, %%xmmC\n\t"
+++ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+++ " movaps 16(%%eax), %%xmm1\n\t"
+++ " movaps %%xmm0, %%xmm4\n\t"
+++ " mulps %%xmm2, %%xmm0\n\t"
+++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+++ " movaps 16(%%edx), %%xmm3\n\t"
+++ " movaps %%xmm1, %%xmm5\n\t"
+++ " addps %%xmm0, %%xmm6\n\t"
+++ " mulps %%xmm3, %%xmm1\n\t"
+++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+++ " addps %%xmm1, %%xmm6\n\t"
+++ " mulps %%xmm4, %%xmm2\n\t"
+++ " movaps 32(%%eax), %%xmm0\n\t"
+++ " addps %%xmm2, %%xmm7\n\t"
+++ " mulps %%xmm5, %%xmm3\n\t"
+++ " addl $32, %%eax\n\t"
+++ " movaps 32(%%edx), %%xmm2\n\t"
+++ " addps %%xmm3, %%xmm7\n\t"
+++ " addl $32, %%edx\n\t"
+++ ".%=L1_test:\n\t"
+++ " decl %%ecx\n\t"
+++ " jge .%=Loop1\n\t"
+++ " # We've handled the bulk of multiplies up to here.\n\t"
+++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+++ " # If so, we've got 2 more taps to do.\n\t"
+++ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
+++ " shrl $4, %%ecx\n\t"
+++ " andl $1, %%ecx\n\t"
+++ " je .%=Leven\n\t"
+++ " # The count was odd, do 2 more taps.\n\t"
+++ " # Note that we've already got mm0/mm2 preloaded\n\t"
+++ " # from the main loop.\n\t"
+++ " movaps %%xmm0, %%xmm4\n\t"
+++ " mulps %%xmm2, %%xmm0\n\t"
+++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+++ " addps %%xmm0, %%xmm6\n\t"
+++ " mulps %%xmm4, %%xmm2\n\t"
+++ " addps %%xmm2, %%xmm7\n\t"
+++ ".%=Leven:\n\t"
+++ " # neg inversor\n\t"
+++ " movl 8(%%ebp), %%eax \n\t"
+++ " xorps %%xmm1, %%xmm1\n\t"
+++ " movl $0x80000000, (%%eax)\n\t"
+++ " movss (%%eax), %%xmm1\n\t"
+++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+++ " # pfpnacc\n\t"
+++ " xorps %%xmm1, %%xmm6\n\t"
+++ " movaps %%xmm6, %%xmm2\n\t"
+++ " unpcklps %%xmm7, %%xmm6\n\t"
+++ " unpckhps %%xmm7, %%xmm2\n\t"
+++ " movaps %%xmm2, %%xmm3\n\t"
+++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+++ " addps %%xmm2, %%xmm6\n\t"
+++ " # xmm6 = r1 i2 r3 i4\n\t"
+++ " #movl 8(%%ebp), %%eax # @result\n\t"
+++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+++ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
+++ " #popl %%ebp\n\t"
+++ :
+++ :
+++ : "eax", "ecx", "edx"
+++ );
+++
+++
+++ int getem = num_bytes % 16;
+++
+++ if(isodd) {
+++ *result += (input[num_points - 1] * taps[num_points - 1]);
+++ }
+++
+++ return;
+++#endif
+++}
+++
+++#endif /* LV_HAVE_SSE && LV_HAVE_32 */
+++
+++#ifdef LV_HAVE_SSE3
+++
+++#include <pmmintrin.h>
+++/*! \brief Complex dot product, *result = sum of input[i]*taps[i] (taps not conjugated); SSE3 version using aligned loads (_mm_load_ps), so input and taps must be 16-byte aligned. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ /* The pair count is taken from num_points directly: a num_points*8 byte count wraps in unsigned int once num_points reaches 2^29. */
+++ unsigned int isodd = num_points & 1;
+++
+++ lv_32fc_t dotProduct;
+++ memset(&dotProduct, 0x0, 2*sizeof(float));
+++
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2;
+++
+++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+++
+++ const lv_32fc_t* a = input;
+++ const lv_32fc_t* b = taps;
+++
+++ dotProdVal = _mm_setzero_ps();
+++
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+++
+++ a += 2;
+++ b += 2;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+++
+++ _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+++ // The accumulator holds two complex partial sums; add them to get the total
+++ dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+++ // An odd final point is not covered by the two-at-a-time loop
+++ if(isodd) {
+++ dotProduct += input[num_points - 1] * taps[num_points - 1];
+++ }
+++
+++ *result = dotProduct;
+++}
+++
+++#endif /*LV_HAVE_SSE3*/
+++
+++#ifdef LV_HAVE_SSE4_1
+++
+++#include <smmintrin.h>
+++/*! \brief Complex dot product, *result = sum of input[i]*taps[i] (taps not conjugated); SSE4.1 version using aligned loads (_mm_load_ps), four complex points per loop pass. */
+++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+++
+++ unsigned int i = 0;
+++ const unsigned int qtr_points = num_points/4;
+++ const unsigned int isodd = num_points & 3; // despite the name: number of leftover points (num_points mod 4)
+++
+++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+++ float *p_input, *p_taps;
+++ __m64 *p_result;
+++ // NOTE(review): neg looks intended as a sign-bit mask for float lane 0 only; this relies on how the compiler initializes __m128i from one integer - confirm
+++ static const __m128i neg = {0x000000000000000080000000};
+++
+++ p_result = (__m64*)result;
+++ p_input = (float*)input;
+++ p_taps = (float*)taps;
+++
+++ real0 = _mm_setzero_ps();
+++ real1 = _mm_setzero_ps();
+++ im0 = _mm_setzero_ps();
+++ im1 = _mm_setzero_ps();
+++
+++ for(; i < qtr_points; ++i) {
+++ xmm0 = _mm_load_ps(p_input);
+++ xmm1 = _mm_load_ps(p_taps);
+++
+++ p_input += 4;
+++ p_taps += 4;
+++
+++ xmm2 = _mm_load_ps(p_input);
+++ xmm3 = _mm_load_ps(p_taps);
+++
+++ p_input += 4;
+++ p_taps += 4;
+++ // De-interleave the loaded complex values into separate real and imaginary vectors
+++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+++
+++ //imaginary vector from input
+++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+++ //real vector from input
+++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+++ //imaginary vector from taps
+++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+++ //real vector from taps
+++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+++ // _mm_dp_ps: mask 0xf1 puts the sum of all four products in lane 0, mask 0xf2 puts it in lane 1
+++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); // sum(in_re * tap_re)
+++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); // sum(in_im * tap_im)
+++
+++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); // sum(in_re * tap_im)
+++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); // sum(in_im * tap_re)
+++
+++ real0 = _mm_add_ps(xmm4, real0);
+++ real1 = _mm_add_ps(xmm5, real1);
+++ im0 = _mm_add_ps(xmm6, im0);
+++ im1 = _mm_add_ps(xmm7, im1);
+++ }
+++ // XOR with the neg mask flips the sign of the im*im sum, so real0 + real1 below is sum(re*re) - sum(im*im)
+++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+++
+++ im0 = _mm_add_ps(im0, im1);
+++ real0 = _mm_add_ps(real0, real1);
+++ // Merge both halves: lane 0 carries the real part, lane 1 the imaginary part
+++ im0 = _mm_add_ps(im0, real0);
+++ // Store the low two floats (one complex value) to *result
+++ _mm_storel_pi(p_result, im0);
+++ // Add the leftover points (num_points mod 4) one at a time
+++ for(i = num_points-isodd; i < num_points; i++) {
+++ *result += input[i] * taps[i];
+++ }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,170 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (SSE3, unaligned loads and stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2;
+++
+++ __m128 x, y, yl, yh, z, tmp1, tmp2;
+++ lv_32fc_t* c = cVector;
+++ const lv_32fc_t* a = aVector;
+++ const lv_32fc_t* b = bVector;
+++ // Two complex products per pass
+++ for(;number < halfPoints; number++){
+++
+++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+++
+++ a += 2;
+++ b += 2;
+++ c += 2;
+++ }
+++ // An odd final point is multiplied in plain C
+++ if((num_points % 2) != 0) {
+++ *c = (*a) * (*b);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (generic C version)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+++ lv_32fc_t* cPtr = cVector;
+++ const lv_32fc_t* aPtr = aVector;
+++ const lv_32fc_t* bPtr= bVector;
+++ unsigned int number = 0;
+++ // One complex product per pass: c[i] = a[i] * b[i]
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) * (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (SSE3, aligned loads and stores)
+++ \param cVector The vector where the results will be stored (written with _mm_store_ps, which requires a 16-byte aligned address)
+++ \param aVector One of the vectors to be multiplied (read with _mm_load_ps, which requires a 16-byte aligned address)
+++ \param bVector One of the vectors to be multiplied (read with _mm_load_ps, which requires a 16-byte aligned address)
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2;
+++
+++ __m128 x, y, yl, yh, z, tmp1, tmp2;
+++ lv_32fc_t* c = cVector;
+++ const lv_32fc_t* a = aVector;
+++ const lv_32fc_t* b = bVector;
+++ for(;number < halfPoints; number++){ // two complex products per pass
+++
+++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++
+++ _mm_store_ps((float*)c,z); // Store the results back into the C container
+++
+++ a += 2;
+++ b += 2;
+++ c += 2;
+++ }
+++ // An odd final point is multiplied in plain C
+++ if((num_points % 2) != 0) {
+++ *c = (*a) * (*b);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++ /*!
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (generic C version)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+++ lv_32fc_t* cPtr = cVector;
+++ const lv_32fc_t* aPtr = aVector;
+++ const lv_32fc_t* bPtr= bVector;
+++ unsigned int number = 0;
+++ // One complex product per pass: c[i] = a[i] * b[i]
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) * (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++ /*!
+++ \brief Multiplies two complex vectors element by element and stores the products in a third vector (wrapper around the external ORC implementation)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+++ volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); // Forwards all arguments unchanged; note the _u_ wrapper calls the _a_ impl symbol
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,409 @@
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++/*!
+++ * TODO: Code the SSE4 version and benchmark it
+++ */
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++
+++
+++ /*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3, unaligned buffers)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+++{
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // Each SSE iteration consumes two complex samples
+++
+++ lv_32fc_t dotProduct_E;
+++ memset(&dotProduct_E, 0x0, 2*sizeof(float));
+++ lv_32fc_t dotProduct_P;
+++ memset(&dotProduct_P, 0x0, 2*sizeof(float));
+++ lv_32fc_t dotProduct_L;
+++ memset(&dotProduct_L, 0x0, 2*sizeof(float));
+++
+++ // SIMD temporaries and the three running accumulators (two complex partial sums each)
+++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
+++
+++ z_E = _mm_setzero_ps();
+++ z_P = _mm_setzero_ps();
+++ z_L = _mm_setzero_ps();
+++
+++ // Read cursors over the five input vectors
+++ // (no intermediate baseband buffer is used; the mixed samples stay in a register)
+++ const lv_32fc_t* _input = input;
+++ const lv_32fc_t* _carrier = carrier;
+++ const lv_32fc_t* _E_code = E_code;
+++ const lv_32fc_t* _P_code = P_code;
+++ const lv_32fc_t* _L_code = L_code;
+++
+++ for(;number < halfPoints; number++)
+++ {
+++ // carrier wipe-off (vector point-to-point product)
+++ x = _mm_loadu_ps((float*)_input); // Unaligned load of two input samples: ar,ai,br,bi
+++ y = _mm_loadu_ps((float*)_carrier); // Unaligned load of two carrier samples: cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts: x = ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = two baseband samples (input * carrier)
+++
+++ // The baseband samples are kept in z instead of being stored to memory
+++
+++ // correlation E,P,L (3x vector scalar product)
+++ // Early
+++ // x carries the baseband samples through all three correlations
+++ x = z;
+++
+++ y = _mm_loadu_ps((float*)_E_code); // Unaligned load: this is the _u_ kernel, the code buffer may not be 16-byte aligned
+++
+++ yl = _mm_moveldup_ps(y); // yl = code real parts duplicated
+++ yh = _mm_movehdup_ps(y); // yh = code imaginary parts duplicated
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = baseband (natural order) * code real parts
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of the baseband samples
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = baseband * E_code (two complex products)
+++
+++ z_E = _mm_add_ps(z_E, z); // Accumulate the Early partial sums
+++
+++ // Prompt
+++ // x still holds the baseband samples, but in swapped order from the Early stage
+++ y = _mm_loadu_ps((float*)_P_code); // Unaligned load: this is the _u_ kernel, the code buffer may not be 16-byte aligned
+++
+++ yl = _mm_moveldup_ps(y); // yl = code real parts duplicated
+++ yh = _mm_movehdup_ps(y); // yh = code imaginary parts duplicated
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is back in natural order
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = baseband (natural order) * code real parts
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of the baseband samples
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = baseband * P_code (two complex products)
+++
+++ z_P = _mm_add_ps(z_P, z); // Accumulate the Prompt partial sums
+++
+++ // Late
+++ // x still holds the baseband samples, but in swapped order from the Prompt stage
+++ y = _mm_loadu_ps((float*)_L_code); // Unaligned load: this is the _u_ kernel, the code buffer may not be 16-byte aligned
+++
+++ yl = _mm_moveldup_ps(y); // yl = code real parts duplicated
+++ yh = _mm_movehdup_ps(y); // yh = code imaginary parts duplicated
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is back in natural order
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = baseband (natural order) * code real parts
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of the baseband samples
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = baseband * L_code (two complex products)
+++
+++ z_L = _mm_add_ps(z_L, z); // Accumulate the Late partial sums
+++
+++ /*pointer increment*/
+++ _carrier += 2;
+++ _input += 2;
+++ // (two complex samples were consumed from every vector)
+++ _E_code += 2;
+++ _P_code += 2;
+++ _L_code +=2;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+++ // The local arrays above are declared 16-byte aligned, so the aligned stores below are safe
+++
+++ _mm_store_ps((float*)dotProductVector_E,z_E); // Spill the two Early partial sums
+++ _mm_store_ps((float*)dotProductVector_P,z_P); // Spill the two Prompt partial sums
+++ _mm_store_ps((float*)dotProductVector_L,z_L); // Spill the two Late partial sums
+++
+++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
+++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
+++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
+++
+++ if((num_points % 2) != 0)
+++ {
+++ // Odd length: fold the single remaining sample in with scalar complex arithmetic
+++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+++ }
+++
+++ *E_out = dotProduct_E;
+++ *P_out = dotProduct_P;
+++ *L_out = dotProduct_L;
+++}
+++
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+++{
+++ lv_32fc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation (index is unsigned to match num_points)
+++ for(unsigned int i=0; i < num_points; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Accumulate the baseband sample against each code replica, directly into the outputs
+++ *E_out += bb_signal_sample * E_code[i];
+++ *P_out += bb_signal_sample * P_code[i];
+++ *L_out += bb_signal_sample * L_code[i];
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3, aligned buffers)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+++{
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // Each SSE iteration consumes two complex samples
+++
+++ lv_32fc_t dotProduct_E;
+++ memset(&dotProduct_E, 0x0, 2*sizeof(float));
+++ lv_32fc_t dotProduct_P;
+++ memset(&dotProduct_P, 0x0, 2*sizeof(float));
+++ lv_32fc_t dotProduct_L;
+++ memset(&dotProduct_L, 0x0, 2*sizeof(float));
+++
+++ // SIMD temporaries and the three running accumulators (two complex partial sums each)
+++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
+++
+++ z_E = _mm_setzero_ps();
+++ z_P = _mm_setzero_ps();
+++ z_L = _mm_setzero_ps();
+++
+++ // Read cursors over the five input vectors
+++ // (no intermediate baseband buffer is used; the mixed samples stay in a register)
+++ const lv_32fc_t* _input = input;
+++ const lv_32fc_t* _carrier = carrier;
+++ const lv_32fc_t* _E_code = E_code;
+++ const lv_32fc_t* _P_code = P_code;
+++ const lv_32fc_t* _L_code = L_code;
+++
+++ for(;number < halfPoints; number++)
+++ {
+++ // carrier wipe-off (vector point-to-point product)
+++ x = _mm_load_ps((float*)_input); // Aligned load of two input samples: ar,ai,br,bi
+++ y = _mm_load_ps((float*)_carrier); // Aligned load of two carrier samples: cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts: x = ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = two baseband samples (input * carrier)
+++
+++ // The baseband samples are kept in z instead of being stored to memory
+++
+++ // correlation E,P,L (3x vector scalar product)
+++ // Early
+++ // x carries the baseband samples through all three correlations
+++ x = z;
+++
+++ y = _mm_load_ps((float*)_E_code); // Aligned load of two Early code samples
+++
+++ yl = _mm_moveldup_ps(y); // yl = code real parts duplicated
+++ yh = _mm_movehdup_ps(y); // yh = code imaginary parts duplicated
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = baseband (natural order) * code real parts
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of the baseband samples
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = baseband * E_code (two complex products)
+++
+++ z_E = _mm_add_ps(z_E, z); // Accumulate the Early partial sums
+++
+++ // Prompt
+++ // x still holds the baseband samples, but in swapped order from the Early stage
+++ y = _mm_load_ps((float*)_P_code); // Aligned load of two Prompt code samples
+++
+++ yl = _mm_moveldup_ps(y); // yl = code real parts duplicated
+++ yh = _mm_movehdup_ps(y); // yh = code imaginary parts duplicated
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is back in natural order
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = baseband (natural order) * code real parts
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of the baseband samples
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = baseband * P_code (two complex products)
+++
+++ z_P = _mm_add_ps(z_P, z); // Accumulate the Prompt partial sums
+++
+++ // Late
+++ // x still holds the baseband samples, but in swapped order from the Prompt stage
+++ y = _mm_load_ps((float*)_L_code); // Aligned load of two Late code samples
+++
+++ yl = _mm_moveldup_ps(y); // yl = code real parts duplicated
+++ yh = _mm_movehdup_ps(y); // yh = code imaginary parts duplicated
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Undo the previous swap: x is back in natural order
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = baseband (natural order) * code real parts
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of the baseband samples
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // z = baseband * L_code (two complex products)
+++
+++ z_L = _mm_add_ps(z_L, z); // Accumulate the Late partial sums
+++
+++ /*pointer increment*/
+++ _carrier += 2;
+++ _input += 2;
+++ // (two complex samples were consumed from every vector)
+++ _E_code += 2;
+++ _P_code += 2;
+++ _L_code +=2;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+++ // The local arrays above are declared 16-byte aligned, so the aligned stores below are safe
+++
+++ _mm_store_ps((float*)dotProductVector_E,z_E); // Spill the two Early partial sums
+++ _mm_store_ps((float*)dotProductVector_P,z_P); // Spill the two Prompt partial sums
+++ _mm_store_ps((float*)dotProductVector_L,z_L); // Spill the two Late partial sums
+++
+++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
+++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
+++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
+++
+++ if((num_points % 2) != 0)
+++ {
+++ // Odd length: fold the single remaining sample in with scalar complex arithmetic
+++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+++ }
+++
+++ *E_out = dotProduct_E;
+++ *P_out = dotProduct_P;
+++ *L_out = dotProduct_L;
+++}
+++
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
+++{
+++ lv_32fc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation (index is unsigned to match num_points)
+++ for(unsigned int i=0; i < num_points; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Accumulate the baseband sample against each code replica, directly into the outputs
+++ *E_out += bb_signal_sample * E_code[i];
+++ *P_out += bb_signal_sample * P_code[i];
+++ *L_out += bb_signal_sample * L_code[i];
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,848 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors
+++ * \authors <ul>
+++ * <li>Javier Arribas, 2011. jarribas(at)cttc.es
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the
+++ * real part and 32 bits the imaginary part):
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 64 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - VE values are calculated by multiplying the input signal in BB by the
+++ * VE code (multiplication of 64 bits vectors), accumulating the results
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 64 bits vectors), accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 64 bits vectors), accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 64 bits vectors), accumulating the results
+++ * - VL values are calculated by multiplying the input signal in BB by the
+++ * VL code (multiplication of 64 bits vectors), accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include <immintrin.h>
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, unaligned buffers)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code VE PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code VL PRN code replica input
+++ \param VE_out VE correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out VL correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+++{
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 4; // Despite the name this is a quarter: each AVX iteration consumes four complex samples
+++
+++ lv_32fc_t dotProduct_VE;
+++ lv_32fc_t dotProduct_E;
+++ lv_32fc_t dotProduct_P;
+++ lv_32fc_t dotProduct_L;
+++ lv_32fc_t dotProduct_VL;
+++
+++ // SIMD temporaries and the five running accumulators (four complex partial sums each)
+++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+++ __m256 bb_signal_sample, bb_signal_sample_shuffled;
+++
+++ z_VE = _mm256_setzero_ps();
+++ z_E = _mm256_setzero_ps();
+++ z_P = _mm256_setzero_ps();
+++ z_L = _mm256_setzero_ps();
+++ z_VL = _mm256_setzero_ps();
+++
+++ // Read cursors over the seven input vectors
+++ const lv_32fc_t* _input = input;
+++ const lv_32fc_t* _carrier = carrier;
+++ const lv_32fc_t* _VE_code = VE_code;
+++ const lv_32fc_t* _E_code = E_code;
+++ const lv_32fc_t* _P_code = P_code;
+++ const lv_32fc_t* _L_code = L_code;
+++ const lv_32fc_t* _VL_code = VL_code;
+++
+++ for(;number < halfPoints; number++)
+++ {
+++ // carrier wipe-off (vector point-to-point product)
+++ x = _mm256_loadu_ps((float*)_input); // Unaligned load of four input samples (re,im interleaved)
+++ y = _mm256_loadu_ps((float*)_carrier); // Unaligned load of four carrier samples (re,im interleaved)
+++
+++ yl = _mm256_moveldup_ps(y); // yl = carrier real parts, each duplicated
+++ yh = _mm256_movehdup_ps(y); // yh = carrier imaginary parts, each duplicated
+++
+++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = input (natural order) * carrier real parts
+++
+++ x = _mm256_shuffle_ps(x,x,0xB1); // Swap real/imaginary parts of every input sample
+++
+++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = input (swapped) * carrier imaginary parts
+++
+++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // Four baseband samples (input * carrier)
+++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Same samples with real/imaginary parts swapped, computed once and reused by all five correlations
+++
+++ // correlation VE,E,P,L,VL (5x vector scalar product)
+++ // VE
+++ y = _mm256_loadu_ps((float*)_VE_code); // Unaligned load of four VE code samples
+++
+++ yl = _mm256_moveldup_ps(y); // yl = code real parts, each duplicated
+++ yh = _mm256_movehdup_ps(y); // yh = code imaginary parts, each duplicated
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = baseband (natural order) * code real parts
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // z = baseband * VE_code (four complex products)
+++ z_VE = _mm256_add_ps(z_VE, z); // Accumulate the VE partial sums
+++
+++ // Early
+++ y = _mm256_loadu_ps((float*)_E_code); // Unaligned load of four Early code samples
+++
+++ yl = _mm256_moveldup_ps(y); // yl = code real parts, each duplicated
+++ yh = _mm256_movehdup_ps(y); // yh = code imaginary parts, each duplicated
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = baseband (natural order) * code real parts
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // z = baseband * E_code (four complex products)
+++ z_E = _mm256_add_ps(z_E, z); // Accumulate the Early partial sums
+++
+++ // Prompt
+++ y = _mm256_loadu_ps((float*)_P_code); // Unaligned load of four Prompt code samples
+++
+++ yl = _mm256_moveldup_ps(y); // yl = code real parts, each duplicated
+++ yh = _mm256_movehdup_ps(y); // yh = code imaginary parts, each duplicated
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = baseband (natural order) * code real parts
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // z = baseband * P_code (four complex products)
+++ z_P = _mm256_add_ps(z_P, z); // Accumulate the Prompt partial sums
+++
+++ // Late
+++ y = _mm256_loadu_ps((float*)_L_code); // Unaligned load of four Late code samples
+++
+++ yl = _mm256_moveldup_ps(y); // yl = code real parts, each duplicated
+++ yh = _mm256_movehdup_ps(y); // yh = code imaginary parts, each duplicated
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = baseband (natural order) * code real parts
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // z = baseband * L_code (four complex products)
+++ z_L = _mm256_add_ps(z_L, z); // Accumulate the Late partial sums
+++
+++ // VL
+++ y = _mm256_loadu_ps((float*)_VL_code); // Unaligned load of four VL code samples
+++
+++ yl = _mm256_moveldup_ps(y); // yl = code real parts, each duplicated
+++ yh = _mm256_movehdup_ps(y); // yh = code imaginary parts, each duplicated
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = baseband (natural order) * code real parts
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = baseband (swapped) * code imaginary parts
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // z = baseband * VL_code (four complex products)
+++ z_VL = _mm256_add_ps(z_VL, z); // Accumulate the VL partial sums
+++
+++ /*pointer increment*/
+++ _carrier += 4;
+++ _input += 4;
+++ _VE_code += 4;
+++ _E_code += 4;
+++ _P_code += 4;
+++ _L_code += 4;
+++ _VL_code += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
+++
+++ _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Spill the four VE partial sums
+++ _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Spill the four Early partial sums
+++ _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Spill the four Prompt partial sums
+++ _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Spill the four Late partial sums
+++ _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Spill the four VL partial sums
+++
+++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
+++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
+++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
+++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
+++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
+++
+++ for (int i = 0; i<(num_points % 4); ++i) // Scalar tail: up to three samples left after the SIMD loop
+++ {
+++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
+++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
+++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
+++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
+++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); // _input and _carrier advance only here, after all five uses
+++ }
+++
+++ *VE_out = dotProduct_VE;
+++ *E_out = dotProduct_E;
+++ *P_out = dotProduct_P;
+++ *L_out = dotProduct_L;
+++ *VL_out = dotProduct_VL;
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++ /*!
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code VE PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code VL PRN code replica input
+++ \param VE_out VE correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out VL correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+++{
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2;
+++
+++ lv_32fc_t dotProduct_VE;
+++ lv_32fc_t dotProduct_E;
+++ lv_32fc_t dotProduct_P;
+++ lv_32fc_t dotProduct_L;
+++ lv_32fc_t dotProduct_VL;
+++
+++ // Aux vars
+++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+++ __m128 bb_signal_sample, bb_signal_sample_shuffled;
+++
+++ z_VE = _mm_setzero_ps();
+++ z_E = _mm_setzero_ps();
+++ z_P = _mm_setzero_ps();
+++ z_L = _mm_setzero_ps();
+++ z_VL = _mm_setzero_ps();
+++
+++ //input and output vectors
+++ const lv_32fc_t* _input = input;
+++ const lv_32fc_t* _carrier = carrier;
+++ const lv_32fc_t* _VE_code = VE_code;
+++ const lv_32fc_t* _E_code = E_code;
+++ const lv_32fc_t* _P_code = P_code;
+++ const lv_32fc_t* _L_code = L_code;
+++ const lv_32fc_t* _VL_code = VL_code;
+++
+++ for(;number < halfPoints; number++)
+++ {
+++ // carrier wipe-off (vector point-to-point product)
+++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
+++
+++ // correlation VE,E,P,L,VL (5x vector scalar product)
+++ // VE
+++ y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
+++
+++ // Early
+++ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
+++
+++ // Prompt
+++ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
+++
+++ // Late
+++ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
+++
+++ // VL
+++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
+++
+++ /*pointer increment*/
+++ _carrier += 2;
+++ _input += 2;
+++ _VE_code += 2;
+++ _E_code += 2;
+++ _P_code += 2;
+++ _L_code +=2;
+++ _VL_code +=2;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
+++
+++ _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+++
+++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
+++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
+++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
+++
+++ if((num_points % 2) != 0)
+++ {
+++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
+++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
+++ }
+++
+++ *VE_out = dotProduct_VE;
+++ *E_out = dotProduct_E;
+++ *P_out = dotProduct_P;
+++ *L_out = dotProduct_L;
+++ *VL_out = dotProduct_VL;
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code VE PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code VL PRN code replica input
+++ \param VE_out VE correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out VL correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+++{
+++ lv_32fc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *VE_out = 0; // the outputs double as accumulators, so clear them first
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ *VL_out = 0;
+++ // perform VE, Early, Prompt, Late and VL correlation
+++ for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points, no signed/unsigned compare
+++ {
+++ // Carrier wipe-off: mix the input sample with the carrier sample
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Multiply the baseband sample by each code replica and accumulate
+++ *VE_out += bb_signal_sample * VE_code[i];
+++ *E_out += bb_signal_sample * E_code[i];
+++ *P_out += bb_signal_sample * P_code[i];
+++ *L_out += bb_signal_sample * L_code[i];
+++ *VL_out += bb_signal_sample * VL_code[i];
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include <immintrin.h>
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, aligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code VE PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code VL PRN code replica input
+++ \param VE_out VE correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out VL correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+++{
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 4; // despite the name: number of 4-sample vector iterations
+++
+++ lv_32fc_t dotProduct_VE;
+++ lv_32fc_t dotProduct_E;
+++ lv_32fc_t dotProduct_P;
+++ lv_32fc_t dotProduct_L;
+++ lv_32fc_t dotProduct_VL;
+++
+++ // Working registers and one vector accumulator per correlator
+++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+++ __m256 bb_signal_sample, bb_signal_sample_shuffled;
+++
+++ z_VE = _mm256_setzero_ps();
+++ z_E = _mm256_setzero_ps();
+++ z_P = _mm256_setzero_ps();
+++ z_L = _mm256_setzero_ps();
+++ z_VL = _mm256_setzero_ps();
+++
+++ // Local read cursors over the seven input vectors
+++ const lv_32fc_t* _input = input;
+++ const lv_32fc_t* _carrier = carrier;
+++ const lv_32fc_t* _VE_code = VE_code;
+++ const lv_32fc_t* _E_code = E_code;
+++ const lv_32fc_t* _P_code = P_code;
+++ const lv_32fc_t* _L_code = L_code;
+++ const lv_32fc_t* _VL_code = VL_code;
+++
+++ for(;number < halfPoints; number++)
+++ {
+++ // carrier wipe-off (vector point-to-point product), 4 complex samples per iteration
+++ x = _mm256_load_ps((float*)_input); // Load 4 complex input samples as ar,ai,br,bi,... (aligned load)
+++ y = _mm256_load_ps((float*)_carrier); // Load 4 complex carrier samples as cr,ci,dr,di,... (aligned load)
+++
+++ yl = _mm256_moveldup_ps(y); // yl = cr,cr,dr,dr,... (real parts duplicated)
+++ yh = _mm256_movehdup_ps(y); // yh = ci,ci,di,di,... (imaginary parts duplicated)
+++
+++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+++
+++ x = _mm256_shuffle_ps(x,x,0xB1); // Swap real/imag of each sample: ai,ar,bi,br,...
+++
+++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+++
+++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Same samples with real/imag swapped, reused by all five correlators
+++
+++ // correlation VE,E,P,L,VL (5x vector scalar product)
+++ // VE
+++ y = _mm256_load_ps((float*)_VE_code); // Load 4 complex VE code samples as cr,ci,dr,di,...
+++
+++ yl = _mm256_moveldup_ps(y); // yl = cr,cr,dr,dr,...
+++ yh = _mm256_movehdup_ps(y); // yh = ci,ci,di,di,...
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+++ z_VE = _mm256_add_ps(z_VE, z); // Accumulate the VE complex products
+++
+++ // Early
+++ y = _mm256_load_ps((float*)_E_code); // Load 4 complex Early code samples as cr,ci,dr,di,...
+++
+++ yl = _mm256_moveldup_ps(y); // yl = cr,cr,dr,dr,...
+++ yh = _mm256_movehdup_ps(y); // yh = ci,ci,di,di,...
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+++ z_E = _mm256_add_ps(z_E, z); // Accumulate the Early complex products
+++
+++ // Prompt
+++ y = _mm256_load_ps((float*)_P_code); // Load 4 complex Prompt code samples as cr,ci,dr,di,...
+++
+++ yl = _mm256_moveldup_ps(y); // yl = cr,cr,dr,dr,...
+++ yh = _mm256_movehdup_ps(y); // yh = ci,ci,di,di,...
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+++ z_P = _mm256_add_ps(z_P, z); // Accumulate the Prompt complex products
+++
+++ // Late
+++ y = _mm256_load_ps((float*)_L_code); // Load 4 complex Late code samples as cr,ci,dr,di,...
+++
+++ yl = _mm256_moveldup_ps(y); // yl = cr,cr,dr,dr,...
+++ yh = _mm256_movehdup_ps(y); // yh = ci,ci,di,di,...
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+++ z_L = _mm256_add_ps(z_L, z); // Accumulate the Late complex products
+++
+++ // VL
+++ y = _mm256_load_ps((float*)_VL_code); // Load 4 complex VL code samples as cr,ci,dr,di,...
+++
+++ yl = _mm256_moveldup_ps(y); // yl = cr,cr,dr,dr,...
+++ yh = _mm256_movehdup_ps(y); // yh = ci,ci,di,di,...
+++
+++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+++
+++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+++ z_VL = _mm256_add_ps(z_VL, z); // Accumulate the VL complex products
+++
+++ /* advance every cursor by the 4 complex samples consumed in this iteration */
+++ _carrier += 4;
+++ _input += 4;
+++ _VE_code += 4;
+++ _E_code += 4;
+++ _P_code += 4;
+++ _L_code += 4;
+++ _VL_code += 4;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
+++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
+++
+++ _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Spill the VE vector accumulator to memory as 4 complex partial sums
+++ _mm256_store_ps((float*)dotProductVector_E,z_E); // Spill the Early vector accumulator
+++ _mm256_store_ps((float*)dotProductVector_P,z_P); // Spill the Prompt vector accumulator
+++ _mm256_store_ps((float*)dotProductVector_L,z_L); // Spill the Late vector accumulator
+++ _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Spill the VL vector accumulator
+++
+++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
+++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
+++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
+++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
+++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
+++
+++ for (int i = 0; i<(num_points % 4); ++i) // scalar tail: the 0..3 samples left after the vector loop
+++ {
+++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
+++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
+++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
+++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
+++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); // only this last statement advances _input and _carrier
+++ }
+++
+++ *VE_out = dotProduct_VE;
+++ *E_out = dotProduct_E;
+++ *P_out = dotProduct_P;
+++ *L_out = dotProduct_L;
+++ *VL_out = dotProduct_VL;
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (SSE3, aligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code VE PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code VL PRN code replica input
+++ \param VE_out VE correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out VL correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+++{
+++ unsigned int number = 0;
+++ const unsigned int halfPoints = num_points / 2; // number of 2-sample vector iterations
+++
+++ lv_32fc_t dotProduct_VE;
+++ lv_32fc_t dotProduct_E;
+++ lv_32fc_t dotProduct_P;
+++ lv_32fc_t dotProduct_L;
+++ lv_32fc_t dotProduct_VL;
+++
+++ // Working registers and one vector accumulator per correlator
+++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+++ __m128 bb_signal_sample, bb_signal_sample_shuffled;
+++
+++ z_VE = _mm_setzero_ps();
+++ z_E = _mm_setzero_ps();
+++ z_P = _mm_setzero_ps();
+++ z_L = _mm_setzero_ps();
+++ z_VL = _mm_setzero_ps();
+++
+++ // Local read cursors over the seven input vectors
+++ const lv_32fc_t* _input = input;
+++ const lv_32fc_t* _carrier = carrier;
+++ const lv_32fc_t* _VE_code = VE_code;
+++ const lv_32fc_t* _E_code = E_code;
+++ const lv_32fc_t* _P_code = P_code;
+++ const lv_32fc_t* _L_code = L_code;
+++ const lv_32fc_t* _VL_code = VL_code;
+++
+++ for(;number < halfPoints; number++)
+++ {
+++ // carrier wipe-off (vector point-to-point product), 2 complex samples per iteration
+++ x = _mm_load_ps((float*)_input); // Load 2 complex input samples as ar,ai,br,bi (aligned load)
+++ y = _mm_load_ps((float*)_carrier); // Load 2 complex carrier samples as cr,ci,dr,di (aligned load)
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr (real parts duplicated)
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di (imaginary parts duplicated)
+++
+++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++
+++ x = _mm_shuffle_ps(x,x,0xB1); // Swap real/imag of each sample: ai,ar,bi,br
+++
+++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Same samples with real/imag swapped, reused by all five correlators
+++
+++ // correlation VE,E,P,L,VL (5x vector scalar product)
+++ // VE
+++ y = _mm_load_ps((float*)_VE_code); // Load 2 complex VE code samples as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_VE = _mm_add_ps(z_VE, z); // Accumulate the VE complex products
+++
+++ // Early
+++ y = _mm_load_ps((float*)_E_code); // Load 2 complex Early code samples as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_E = _mm_add_ps(z_E, z); // Accumulate the Early complex products
+++
+++ // Prompt
+++ y = _mm_load_ps((float*)_P_code); // Load 2 complex Prompt code samples as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_P = _mm_add_ps(z_P, z); // Accumulate the Prompt complex products
+++
+++ // Late
+++ y = _mm_load_ps((float*)_L_code); // Load 2 complex Late code samples as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_L = _mm_add_ps(z_L, z); // Accumulate the Late complex products
+++
+++ // VL
+++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
+++ y = _mm_load_ps((float*)_VL_code); // Load 2 complex VL code samples as cr,ci,dr,di
+++
+++ yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
+++ yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di
+++
+++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+++
+++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+++ z_VL = _mm_add_ps(z_VL, z); // Accumulate the VL complex products
+++
+++ /* advance every cursor by the 2 complex samples consumed in this iteration */
+++ _carrier += 2;
+++ _input += 2;
+++ _VE_code += 2;
+++ _E_code += 2;
+++ _P_code += 2;
+++ _L_code +=2;
+++ _VL_code +=2;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
+++
+++ _mm_store_ps((float*)dotProductVector_VE,z_VE); // Spill the VE vector accumulator to memory as 2 complex partial sums
+++ _mm_store_ps((float*)dotProductVector_E,z_E); // Spill the Early vector accumulator
+++ _mm_store_ps((float*)dotProductVector_P,z_P); // Spill the Prompt vector accumulator
+++ _mm_store_ps((float*)dotProductVector_L,z_L); // Spill the Late vector accumulator
+++ _mm_store_ps((float*)dotProductVector_VL,z_VL); // Spill the VL vector accumulator
+++
+++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
+++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
+++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
+++
+++ if((num_points % 2) != 0) // odd count: one sample is left after the vector loop
+++ {
+++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
+++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
+++ }
+++
+++ *VE_out = dotProduct_VE;
+++ *E_out = dotProduct_E;
+++ *P_out = dotProduct_P;
+++ *L_out = dotProduct_L;
+++ *VL_out = dotProduct_VL;
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code VE PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code VL PRN code replica input
+++ \param VE_out VE correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out VL correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+++{
+++ lv_32fc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *VE_out = 0; // the outputs double as accumulators, so clear them first
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ *VL_out = 0;
+++ // perform VE, Early, Prompt, Late and VL correlation
+++ for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points, no signed/unsigned compare
+++ {
+++ // Carrier wipe-off: mix the input sample with the carrier sample
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Multiply the baseband sample by each code replica and accumulate
+++ *VE_out += bb_signal_sample * VE_code[i];
+++ *E_out += bb_signal_sample * E_code[i];
+++ *P_out += bb_signal_sample * P_code[i];
+++ *L_out += bb_signal_sample * L_code[i];
+++ *VL_out += bb_signal_sample * VL_code[i];
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,243 @@
+++/*!
+++ * \file volk_gnsssdr_64f_accumulator_64f.h
+++ * \brief Volk protokernel: 64 bits (double) scalar accumulator
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that implements an accumulator of double values
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
+++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include <immintrin.h>
+++/*!
+++ \brief Accumulates the values in the input buffer (AVX, unaligned loads)
+++ \param result The accumulated result (sum of the first num_points doubles of inputBuffer)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++ */
+++static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){
+++ double returnValue = 0;
+++ const unsigned int sse_iters = num_points / 4; // number of 4-double vector iterations
+++
+++ const double* aPtr = inputBuffer;
+++
+++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; // scratch for the horizontal sum
+++ __m256d accumulator = _mm256_setzero_pd(); // four running partial sums
+++ __m256d aVal = _mm256_setzero_pd();
+++
+++ for(unsigned int number = 0; number < sse_iters; number++)
+++ {
+++ aVal = _mm256_loadu_pd(aPtr); // unaligned load of 4 doubles
+++ accumulator = _mm256_add_pd(accumulator, aVal);
+++ aPtr += 4;
+++ }
+++
+++ _mm256_storeu_pd((double*)tempBuffer,accumulator); // spill the partial sums to memory
+++
+++ for(int i = 0; i<4; ++i){ // horizontal sum of the four lanes
+++ returnValue += tempBuffer[i];
+++ }
+++
+++ for(int i = 0; i<(num_points % 4); ++i){ // scalar tail: the 0..3 values left after the vector loop
+++ returnValue += (*aPtr++);
+++ }
+++
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Accumulates the values in the input buffer (SSE3 build, unaligned loads)
+++ \param result The accumulated result (sum of the first num_points doubles of inputBuffer)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++ */
+++static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){
+++ double returnValue = 0;
+++ const unsigned int sse_iters = num_points / 2; // number of 2-double vector iterations
+++
+++ const double* aPtr = inputBuffer;
+++
+++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; // scratch for the horizontal sum
+++ __m128d accumulator = _mm_setzero_pd(); // two running partial sums
+++ __m128d aVal = _mm_setzero_pd();
+++
+++ for(unsigned int number = 0; number < sse_iters; number++)
+++ {
+++ aVal = _mm_loadu_pd(aPtr); // unaligned load of 2 doubles
+++ accumulator = _mm_add_pd(accumulator, aVal);
+++ aPtr += 2;
+++ }
+++
+++ _mm_storeu_pd((double*)tempBuffer,accumulator); // spill the partial sums to memory
+++
+++ for(int i = 0; i<2; ++i){ // horizontal sum of the two lanes
+++ returnValue += tempBuffer[i];
+++ }
+++
+++ for(unsigned int i = 0; i<(num_points % 2); ++i){ // scalar tail: at most one value left
+++ returnValue += (*aPtr++);
+++ }
+++
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Accumulates the values in the input buffer (portable scalar implementation)
+++ \param result The accumulated result (sum of the first num_points doubles of inputBuffer; 0 when num_points is 0)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++ */
+++static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){
+++ const double* aPtr = inputBuffer; // read cursor over the input
+++ double returnValue = 0;
+++
+++ for(unsigned int number = 0;number < num_points; number++){
+++ returnValue += (*aPtr++); // add the current value, then advance the cursor
+++ }
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
+++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include <immintrin.h>
+++/*!
+++ \brief Accumulates the values in the input buffer (AVX, aligned loads)
+++ \param result The accumulated result (sum of the first num_points doubles of inputBuffer)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++ */
+++static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){
+++ double returnValue = 0;
+++ const unsigned int sse_iters = num_points / 4; // number of 4-double vector iterations
+++
+++ const double* aPtr = inputBuffer;
+++
+++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; // aligned scratch for the horizontal sum
+++ __m256d accumulator = _mm256_setzero_pd(); // four running partial sums
+++ __m256d aVal = _mm256_setzero_pd();
+++
+++ for(unsigned int number = 0; number < sse_iters; number++)
+++ {
+++ aVal = _mm256_load_pd(aPtr); // aligned load of 4 doubles
+++ accumulator = _mm256_add_pd(accumulator, aVal);
+++ aPtr += 4;
+++ }
+++
+++ _mm256_store_pd((double*)tempBuffer,accumulator); // spill the partial sums to the aligned scratch buffer
+++
+++ for(int i = 0; i<4; ++i){ // horizontal sum of the four lanes
+++ returnValue += tempBuffer[i];
+++ }
+++
+++ for(int i = 0; i<(num_points % 4); ++i){ // scalar tail: the 0..3 values left after the vector loop
+++ returnValue += (*aPtr++);
+++ }
+++
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Accumulates the values in the input buffer (SSE3 build, aligned loads)
+++ \param result The accumulated result (sum of the first num_points doubles of inputBuffer)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++ */
+++static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){
+++ double returnValue = 0;
+++ const unsigned int sse_iters = num_points / 2; // number of 2-double vector iterations
+++
+++ const double* aPtr = inputBuffer;
+++
+++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; // aligned scratch for the horizontal sum
+++ __m128d accumulator = _mm_setzero_pd(); // two running partial sums
+++ __m128d aVal = _mm_setzero_pd();
+++
+++ for(unsigned int number = 0; number < sse_iters; number++)
+++ {
+++ aVal = _mm_load_pd(aPtr); // aligned load of 2 doubles
+++ accumulator = _mm_add_pd(accumulator, aVal);
+++ aPtr += 2;
+++ }
+++
+++ _mm_store_pd((double*)tempBuffer,accumulator); // spill the partial sums to the aligned scratch buffer
+++
+++ for(int i = 0; i<2; ++i){ // horizontal sum of the two lanes
+++ returnValue += tempBuffer[i];
+++ }
+++
+++ for(unsigned int i = 0; i<(num_points % 2); ++i){ // scalar tail: at most one value left
+++ returnValue += (*aPtr++);
+++ }
+++
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Accumulates the values in the input buffer (portable scalar implementation)
+++ \param result The accumulated result (sum of the first num_points doubles of inputBuffer; 0 when num_points is 0)
+++ \param inputBuffer The buffer of data to be accumulated
+++ \param num_points The number of values in inputBuffer to be accumulated
+++ */
+++static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){
+++ const double* aPtr = inputBuffer; // read cursor over the input
+++ double returnValue = 0;
+++
+++ for(unsigned int number = 0;number < num_points; number++){
+++ returnValue += (*aPtr++); // add the current value, then advance the cursor
+++ }
+++ *result = returnValue;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,183 @@
+++/*!
+++ * \file volk_gnsssdr_8i_accumulator_s8i.h
+++ * \brief Volk protokernel: 8 bits (char) scalar accumulator
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that implements an accumulator of char values
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
+++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <xmmintrin.h>
+++/*!
+++ \brief Sums the chars of a buffer, 16 per iteration, using unaligned SSE loads
+++ \param result Receives the accumulated value, held in a single char
+++ \param inputBuffer The values to be summed; no alignment is required
+++ \param num_points How many values of inputBuffer are summed
+++ */
+++static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){
+++    const unsigned int blockCount = num_points / 16;
+++    const unsigned int leftover = num_points % 16;
+++
+++    const char* cursor = inputBuffer;
+++    __VOLK_ATTR_ALIGNED(16) char lanes[16];
+++    __m128i laneSums = _mm_setzero_si128();
+++
+++    // Each of the 16 byte lanes keeps its own running sum.
+++    for(unsigned int block = 0; block < blockCount; ++block){
+++        const __m128i chunk = _mm_lddqu_si128((__m128i*)cursor);
+++        laneSums = _mm_add_epi8(laneSums, chunk);
+++        cursor += 16;
+++    }
+++    _mm_storeu_si128((__m128i*)lanes, laneSums);
+++
+++    // Fold the lane sums first, then the values that did not fill a register.
+++    char total = 0;
+++    for(unsigned int lane = 0; lane < 16; ++lane){
+++        total += lanes[lane];
+++    }
+++    for(unsigned int k = 0; k < leftover; ++k){
+++        total += cursor[k];
+++    }
+++
+++    *result = total;
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Sums num_points chars taken from inputBuffer, front to back
+++ \param result Receives the accumulated value, held in a single char
+++ \param inputBuffer The values to be summed
+++ \param num_points How many values of inputBuffer are summed
+++ */
+++static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){
+++    char total = 0;
+++    unsigned int k = 0;
+++
+++    while(k < num_points){
+++        total += inputBuffer[k++];
+++    }
+++    *result = total;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
+++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <xmmintrin.h>
+++/*!
+++ \brief Sums the chars of a buffer, 16 per iteration, using aligned SSE loads
+++ \param result Receives the accumulated value, held in a single char
+++ \param inputBuffer The values to be summed; read with the aligned load _mm_load_si128
+++ \param num_points How many values of inputBuffer are summed
+++ */
+++static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){
+++    const unsigned int blockCount = num_points / 16;
+++    const unsigned int leftover = num_points % 16;
+++
+++    const char* cursor = inputBuffer;
+++    __VOLK_ATTR_ALIGNED(16) char lanes[16];
+++    __m128i laneSums = _mm_setzero_si128();
+++
+++    // Each of the 16 byte lanes keeps its own running sum.
+++    for(unsigned int block = 0; block < blockCount; ++block){
+++        const __m128i chunk = _mm_load_si128((__m128i*)cursor);
+++        laneSums = _mm_add_epi8(laneSums, chunk);
+++        cursor += 16;
+++    }
+++    _mm_store_si128((__m128i*)lanes, laneSums);
+++
+++    // Fold the lane sums first, then the values that did not fill a register.
+++    char total = 0;
+++    for(unsigned int lane = 0; lane < 16; ++lane){
+++        total += lanes[lane];
+++    }
+++    for(unsigned int k = 0; k < leftover; ++k){
+++        total += cursor[k];
+++    }
+++
+++    *result = total;
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Sums num_points chars taken from inputBuffer, front to back
+++ \param result Receives the accumulated value, held in a single char
+++ \param inputBuffer The values to be summed
+++ \param num_points How many values of inputBuffer are summed
+++ */
+++static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){
+++    char total = 0;
+++
+++    // The running sum lives in a char, exactly like the value handed back.
+++    for(unsigned int k = 0; k != num_points; ++k){
+++        total += inputBuffer[k];
+++    }
+++    *result = total;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Accumulates the input buffer through the external ORC routine and returns one byte of its 16-bit result
+++ \param result Receives the byte stored at offset 1 of the short filled in by the ORC routine
+++ \param inputBuffer The buffer of data to be accumulated (forwarded unchanged)
+++ \param num_points The number of values in inputBuffer to be accumulated (forwarded unchanged)
+++ */
+++extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
+++static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
+++ // NOTE(review): named _u_orc but defined in the _a_H section and backed by the _a_orc_impl routine - confirm intent.
+++ short res = 0;
+++ char* resc = (char*)&res;
+++ resc++; // second byte of res; which half of the short that is depends on endianness - TODO confirm
+++
+++ volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points);
+++ // Only the byte selected above is handed back; the other byte of res is dropped.
+++ *result = *resc;
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */
+++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,493 @@
+++/*!
+++ * \file volk_gnsssdr_8i_index_max_16u.h
+++ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
+++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include "immintrin.h"
+++/*!
+++ \brief Returns the index of the max value in src0 (AVX, unaligned loads, 32 values per iteration)
+++ \param target Receives the index of the first maximum; not written when num_points is 0
+++ \param src0 The buffer of data to be analysed; no alignment is required
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 32;
+++
+++        char* basePtr = (char*)src0;
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned int index = 0;
+++        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+++        __m256i ones, compareResults, currentValues;
+++        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
+++
+++        ones = _mm256_set1_epi8(0xFF);
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
+++            // The byte compare runs on the two 128-bit halves; the masks are glued back together below.
+++            lo = _mm256_castsi256_si128(currentValues);
+++            hi = _mm256_extractf128_si256(currentValues,1);
+++
+++            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
+++            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
+++
+++            //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
+++            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
+++            // Mask not all ones: some value is >= max, so this chunk is rescanned in scalar code.
+++            if (!_mm256_testc_si256(compareResults, ones))
+++            {
+++                _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
+++
+++                for(int i = 0; i < 32; i++)
+++                {
+++                    if(currentValuesBuffer[i] > max)
+++                    {
+++                        index = inputPtr - basePtr + i;
+++                        max = currentValuesBuffer[i];
+++                    }
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++
+++            inputPtr += 32;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 32); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                index = inputPtr - basePtr + i;
+++                max = inputPtr[i];
+++            }
+++        }
+++        target[0] = index;
+++    }
+++}
+++
+++#endif /*LV_HAVE_AVX*/
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include<smmintrin.h>
+++/*!
+++ \brief Returns the index of the max value in src0 (SSE4.1, unaligned loads, 16 values per iteration)
+++ \param target Receives the index of the first maximum; not written when num_points is 0
+++ \param src0 The buffer of data to be analysed; no alignment is required
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* basePtr = (char*)src0;
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned int index = 0;
+++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
+++
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            // Mask not all ones: some value is >= max, so this chunk is rescanned in scalar code.
+++            if (!_mm_test_all_ones(compareResults))
+++            {
+++                _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
+++
+++                for(int i = 0; i < 16; i++)
+++                {
+++                    if(currentValuesBuffer[i] > max)
+++                    {
+++                        index = inputPtr - basePtr + i;
+++                        max = currentValuesBuffer[i];
+++                    }
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++
+++            inputPtr += 16;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                index = inputPtr - basePtr + i;
+++                max = inputPtr[i];
+++            }
+++        }
+++        target[0] = index;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#ifdef LV_HAVE_SSE2
+++#include<xmmintrin.h>
+++/*!
+++ \brief Returns the index of the max value in src0 (SSE2, unaligned loads, 16 values per iteration)
+++ \param target Receives the index of the first maximum; not written when num_points is 0
+++ \param src0 The buffer of data to be analysed; no alignment is required
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* basePtr = (char*)src0;
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned int index = 0;
+++        unsigned short mask;
+++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            mask = _mm_movemask_epi8(compareResults);
+++            // One mask bit per byte: a cleared bit marks a value that is >= max.
+++            if (mask != 0xFFFF)
+++            {
+++                _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
+++                mask = ~mask;
+++                int i = 0;
+++                while (mask > 0)
+++                {
+++                    if ((mask & 1) == 1)
+++                    {
+++                        if(currentValuesBuffer[i] > max)
+++                        {
+++                            index = inputPtr - basePtr + i;
+++                            max = currentValuesBuffer[i];
+++                        }
+++                    }
+++                    i++;
+++                    mask >>= 1;
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++            inputPtr += 16;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                index = inputPtr - basePtr + i;
+++                max = inputPtr[i];
+++            }
+++        }
+++        target[0] = index;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE2*/
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Finds the position of the first occurrence of the largest value in src0
+++ \param target Receives the index of the maximum; left untouched when num_points is 0
+++ \param src0 The buffer of data to be analysed
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
+++    // Nothing to scan: the caller's target keeps its previous value.
+++    if(num_points == 0)
+++    {
+++        return;
+++    }
+++
+++    unsigned int best = 0;
+++    for(unsigned int k = 1; k < num_points; ++k)
+++    {
+++        // Strict comparison, so ties resolve to the earliest index.
+++        if(src0[k] > src0[best])
+++        {
+++            best = k;
+++        }
+++    }
+++    target[0] = best;
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
+++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include "immintrin.h"
+++/*!
+++ \brief Returns the index of the max value in src0 (AVX, aligned loads, 32 values per iteration)
+++ \param target Receives the index of the first maximum; not written when num_points is 0
+++ \param src0 The buffer of data to be analysed; read with the aligned load _mm256_load_si256
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 32;
+++
+++        char* basePtr = (char*)src0;
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned int index = 0;
+++        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+++        __m256i ones, compareResults, currentValues;
+++        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
+++
+++        ones = _mm256_set1_epi8(0xFF);
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm256_load_si256((__m256i*)inputPtr);
+++            // The byte compare runs on the two 128-bit halves; the masks are glued back together below.
+++            lo = _mm256_castsi256_si128(currentValues);
+++            hi = _mm256_extractf128_si256(currentValues,1);
+++
+++            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
+++            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
+++
+++            //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
+++            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
+++            // Mask not all ones: some value is >= max, so this chunk is rescanned in scalar code.
+++            if (!_mm256_testc_si256(compareResults, ones))
+++            {
+++                _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
+++
+++                for(int i = 0; i < 32; i++)
+++                {
+++                    if(currentValuesBuffer[i] > max)
+++                    {
+++                        index = inputPtr - basePtr + i;
+++                        max = currentValuesBuffer[i];
+++                    }
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++
+++            inputPtr += 32;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 32); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                index = inputPtr - basePtr + i;
+++                max = inputPtr[i];
+++            }
+++        }
+++        target[0] = index;
+++    }
+++}
+++
+++#endif /*LV_HAVE_AVX*/
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "emmintrin.h"
+++/*!
+++ \brief Returns the index of the max value in src0 (SSE4.1, aligned loads, 16 values per iteration)
+++ \param target Receives the index of the first maximum; not written when num_points is 0
+++ \param src0 The buffer of data to be analysed; read with the aligned load _mm_load_si128
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* basePtr = (char*)src0;
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned int index = 0;
+++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_load_si128((__m128i*)inputPtr);
+++
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            // Mask not all ones: some value is >= max, so this chunk is rescanned in scalar code.
+++            if (!_mm_test_all_ones(compareResults))
+++            {
+++                _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+++
+++                for(int i = 0; i < 16; i++)
+++                {
+++                    if(currentValuesBuffer[i] > max)
+++                    {
+++                        index = inputPtr - basePtr + i;
+++                        max = currentValuesBuffer[i];
+++                    }
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++
+++            inputPtr += 16;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                index = inputPtr - basePtr + i;
+++                max = inputPtr[i];
+++            }
+++        }
+++        target[0] = index;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Returns the index of the max value in src0 (SSE2, aligned loads, 16 values per iteration)
+++ \param target Receives the index of the first maximum; not written when num_points is 0
+++ \param src0 The buffer of data to be analysed; read with the aligned load _mm_load_si128
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* basePtr = (char*)src0;
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned int index = 0;
+++        unsigned short mask;
+++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_load_si128((__m128i*)inputPtr);
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            mask = _mm_movemask_epi8(compareResults);
+++            // One mask bit per byte: a cleared bit marks a value that is >= max.
+++            if (mask != 0xFFFF)
+++            {
+++                _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+++                mask = ~mask;
+++                int i = 0;
+++                while (mask > 0)
+++                {
+++                    if ((mask & 1) == 1)
+++                    {
+++                        if(currentValuesBuffer[i] > max)
+++                        {
+++                            index = inputPtr - basePtr + i;
+++                            max = currentValuesBuffer[i];
+++                        }
+++                    }
+++                    i++;
+++                    mask >>= 1;
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++            inputPtr += 16;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                index = inputPtr - basePtr + i;
+++                max = inputPtr[i];
+++            }
+++        }
+++        target[0] = index;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE2*/
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Finds the position of the first occurrence of the largest value in src0
+++ \param target Receives the index of the maximum; left untouched when num_points is 0
+++ \param src0 The buffer of data to be analysed
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
+++    // An empty input produces no index, so target is not written.
+++    if(num_points == 0)
+++    {
+++        return;
+++    }
+++
+++    unsigned int winner = 0;
+++    unsigned int k = 1;
+++    while(k < num_points)
+++    {
+++        // Only a strictly larger value replaces the current winner.
+++        winner = (src0[k] > src0[winner]) ? k : winner;
+++        ++k;
+++    }
+++
+++    target[0] = winner;
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,327 @@
+++/*!
+++ * \file volk_gnsssdr_8i_max_s8i.h
+++ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
+++#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include<smmintrin.h>
+++/*!
+++ \brief Computes the max value in src0 (SSE4.1, unaligned loads, 16 values per iteration)
+++ \param target NOTE(review): passed by value, so the maximum assigned to it never reaches the caller
+++ \param src0 The buffer of data to be analysed; no alignment is required
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++        // Per-lane running maximum: keep maxValues where it is larger, else take the new value.
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
+++            inputPtr += 16;
+++        }
+++
+++        _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
+++        // Reduce the 16 lane maxima to a single value.
+++        for(int i = 0; i<16; ++i)
+++        {
+++            if(maxValuesBuffer[i] > max)
+++            {
+++                max = maxValuesBuffer[i];
+++            }
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                max = inputPtr[i];
+++            }
+++        }
+++        target = max;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#ifdef LV_HAVE_SSE2
+++#include<xmmintrin.h>
+++/*!
+++ \brief Computes the max value in src0 (SSE2, unaligned loads, 16 values per iteration)
+++ \param target NOTE(review): passed by value, so the maximum assigned to it never reaches the caller
+++ \param src0 The buffer of data to be analysed; no alignment is required
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned short mask;
+++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            mask = _mm_movemask_epi8(compareResults);
+++            // One mask bit per byte: a cleared bit marks a value that is >= max.
+++            if (mask != 0xFFFF)
+++            {
+++                _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
+++                mask = ~mask;
+++                int i = 0;
+++                while (mask > 0)
+++                {
+++                    if ((mask & 1) == 1)
+++                    {
+++                        if(currentValuesBuffer[i] > max)
+++                        {
+++                            max = currentValuesBuffer[i];
+++                        }
+++                    }
+++                    i++;
+++                    mask >>= 1;
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++            inputPtr += 16;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                max = inputPtr[i];
+++            }
+++        }
+++        target = max;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE2*/
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Scans src0 for its largest value
+++ \param target Assigned the largest value, but only as a local copy (the parameter is passed by value)
+++ \param src0 The buffer of data to be analysed
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
+++    // NOTE(review): because target is a by-value char, callers cannot observe the result.
+++    if(num_points == 0)
+++    {
+++        return;
+++    }
+++
+++    char largest = src0[0];
+++    for(unsigned int k = 1; k != num_points; ++k)
+++    {
+++        largest = (src0[k] > largest) ? src0[k] : largest;
+++    }
+++
+++    target = largest;
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
+++#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Computes the max value in src0 (SSE4.1, aligned loads, 16 values per iteration)
+++ \param target NOTE(review): passed by value, so the maximum assigned to it never reaches the caller
+++ \param src0 The buffer of data to be analysed; read with the aligned load _mm_load_si128
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++        // Per-lane running maximum: keep maxValues where it is larger, else take the new value.
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_load_si128((__m128i*)inputPtr);
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
+++            inputPtr += 16;
+++        }
+++
+++        _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
+++        // Reduce the 16 lane maxima to a single value.
+++        for(int i = 0; i<16; ++i)
+++        {
+++            if(maxValuesBuffer[i] > max)
+++            {
+++                max = maxValuesBuffer[i];
+++            }
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                max = inputPtr[i];
+++            }
+++        }
+++        target = max;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Computes the max value in src0 (SSE2, aligned loads, 16 values per iteration)
+++ \param target NOTE(review): passed by value, so the maximum assigned to it never reaches the caller
+++ \param src0 The buffer of data to be analysed; read with the aligned load _mm_load_si128
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0){
+++        const unsigned int sse_iters = num_points / 16;
+++
+++        char* inputPtr = (char*)src0;
+++        char max = src0[0];
+++        unsigned short mask;
+++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+++        __m128i maxValues, compareResults, currentValues;
+++
+++        maxValues = _mm_set1_epi8(max);
+++
+++        for(unsigned int number = 0; number < sse_iters; number++)
+++        {
+++            currentValues = _mm_load_si128((__m128i*)inputPtr);
+++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+++            mask = _mm_movemask_epi8(compareResults);
+++            // One mask bit per byte: a cleared bit marks a value that is >= max.
+++            if (mask != 0xFFFF)
+++            {
+++                _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+++                mask = ~mask;
+++                int i = 0;
+++                while (mask > 0)
+++                {
+++                    if ((mask & 1) == 1)
+++                    {
+++                        if(currentValuesBuffer[i] > max)
+++                        {
+++                            max = currentValuesBuffer[i];
+++                        }
+++                    }
+++                    i++;
+++                    mask >>= 1;
+++                }
+++                maxValues = _mm_set1_epi8(max);
+++            }
+++            inputPtr += 16;
+++        }
+++        // Tail: inputPtr now points at the values that did not fill a whole register.
+++        for(unsigned int i = 0; i < (num_points % 16); ++i)
+++        {
+++            if(inputPtr[i] > max)
+++            {
+++                max = inputPtr[i];
+++            }
+++        }
+++        target = max;
+++    }
+++}
+++
+++#endif /*LV_HAVE_SSE2*/
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Scans src0 for its largest value
+++ \param target Assigned the largest value, but only as a local copy (the parameter is passed by value)
+++ \param src0 The buffer of data to be analysed
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
+++    // NOTE(review): because target is a by-value char, callers cannot observe the result.
+++    if(num_points == 0)
+++    {
+++        return;
+++    }
+++    char largest = src0[0];
+++    unsigned int k = 1;
+++    while(k < num_points)
+++    {
+++        if(src0[k] > largest)
+++        {
+++            largest = src0[k];
+++        }
+++        ++k;
+++    }
+++    target = largest;
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,184 @@
+++/*!
+++ * \file volk_gnsssdr_8i_x2_add_8i.h
+++ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that adds pairs of 8 bits (char) scalars
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
+++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include "pmmintrin.h"
+++/*!
+++ \brief Adds the two input vectors and stores their results in the third vector (SSE2, unaligned access)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++    // Only SSE2 intrinsics are used, matching the LV_HAVE_SSE2 guard around this kernel.
+++    const unsigned int sse_iters = num_points / 16;
+++
+++    char* cPtr = cVector;
+++    const char* aPtr = aVector;
+++    const char* bPtr= bVector;
+++
+++    __m128i aVal, bVal, cVal;
+++
+++    for(unsigned int number = 0; number < sse_iters; number++){
+++        // _mm_loadu_si128 is the SSE2 unaligned load (_mm_lddqu_si128 would need SSE3).
+++        aVal = _mm_loadu_si128((__m128i*)aPtr);
+++        bVal = _mm_loadu_si128((__m128i*)bPtr);
+++
+++        cVal = _mm_add_epi8(aVal, bVal);
+++
+++        _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
+++
+++        aPtr += 16;
+++        bPtr += 16;
+++        cPtr += 16;
+++    }
+++
+++    for(unsigned int i = 0; i < (num_points % 16); ++i)
+++    {
+++        *cPtr++ = (*aPtr++) + (*bPtr++);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Adds the two input vectors element by element and stores the results in cVector (plain C loop)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++ char* cPtr = cVector;
+++ const char* aPtr = aVector;
+++ const char* bPtr= bVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) + (*bPtr++); // the int sum is converted back to char on assignment
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
+++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Adds two vectors of 8-bit values element by element (SSE2, 16-byte aligned buffers) and stores the sums in cVector
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 16; // one 128-bit register holds 16 chars
+++
+++ char* cPtr = cVector;
+++ const char* aPtr = aVector;
+++ const char* bPtr= bVector;
+++
+++ __m128i aVal, bVal, cVal;
+++
+++ for(unsigned int number = 0; number < sse_iters; number++){
+++
+++ aVal = _mm_load_si128((const __m128i*)aPtr); // aligned load: the pointer must be 16-byte aligned
+++ bVal = _mm_load_si128((const __m128i*)bPtr);
+++
+++ cVal = _mm_add_epi8(aVal, bVal);
+++
+++ _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
+++
+++ aPtr += 16;
+++ bPtr += 16;
+++ cPtr += 16;
+++ }
+++
+++ for(unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining (< 16) values
+++ {
+++ *cPtr++ = (*aPtr++) + (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Adds the two input vectors element by element and stores the results in cVector (plain C loop)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++ char* cPtr = cVector;
+++ const char* aPtr = aVector;
+++ const char* bPtr= bVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) + (*bPtr++); // the int sum is converted back to char on assignment
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Adds the two input vectors by forwarding all arguments to the externally defined volk_gnsssdr_8i_x2_add_8i_a_orc_impl
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++ volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,326 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_conjugate_8ic.h
+++ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that calculates the conjugate of a
+++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include "immintrin.h"
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts (AVX, unaligned buffers)
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ const unsigned int sse_iters = num_points / 16; // each iteration consumes 16 lv_8sc_t elements
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++
+++ __m256 tmp;
+++ __m128i tmp128lo, tmp128hi;
+++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
+++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+++
+++ for (unsigned int i = 0; i < sse_iters; ++i)
+++ {
+++ tmp = _mm256_loadu_ps((float*)a);
+++ tmp = _mm256_xor_ps(tmp, conjugator1); // invert every bit of the odd bytes (the parts lv_conj negates)
+++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
+++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); // +1 on the odd bytes completes the two's-complement negation
+++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
+++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
+++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
+++ tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); // insertf128_si256 yields __m256i, so cast back to __m256
+++ _mm256_storeu_ps((float*)c, tmp);
+++
+++ a += 16;
+++ c += 16;
+++ }
+++
+++ for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining (< 16) values
+++ {
+++ *c++ = lv_conj(*a++);
+++ }
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSSE3
+++#include "tmmintrin.h"
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts (SSSE3, unaligned buffers)
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ const unsigned int sse_iters = num_points / 8; // each 16-byte load is followed by an advance of 8 lv_8sc_t elements
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ __m128i tmp;
+++
+++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+++
+++ for (int i = 0; i < sse_iters; ++i)
+++ {
+++ tmp = _mm_lddqu_si128((__m128i*)a);
+++ tmp = _mm_sign_epi8(tmp, conjugator); // keeps the even bytes and negates the odd bytes
+++ _mm_storeu_si128((__m128i*)c, tmp);
+++ a += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining (< 8) values
+++ {
+++ *c++ = lv_conj(*a++);
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSSE3 */
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts (SSE3, unaligned buffers)
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ const unsigned int sse_iters = num_points / 8; // each 16-byte load is followed by an advance of 8 lv_8sc_t elements
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ __m128i tmp;
+++
+++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+++
+++ for (int i = 0; i < sse_iters; ++i)
+++ {
+++ tmp = _mm_lddqu_si128((__m128i*)a);
+++ tmp = _mm_xor_si128(tmp, conjugator1); // invert every bit of the odd bytes
+++ tmp = _mm_add_epi8(tmp, conjugator2); // +1 on the odd bytes completes the two's-complement negation
+++ _mm_storeu_si128((__m128i*)c, tmp);
+++ a += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining (< 8) values
+++ {
+++ *c++ = lv_conj(*a++);
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts, one lv_conj call per element
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = lv_conj(*aPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include "immintrin.h"
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts (AVX, 32-byte aligned buffers)
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ const unsigned int sse_iters = num_points / 16; // each iteration consumes 16 lv_8sc_t elements
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++
+++ __m256 tmp;
+++ __m128i tmp128lo, tmp128hi;
+++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
+++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+++
+++ for (unsigned int i = 0; i < sse_iters; ++i)
+++ {
+++ tmp = _mm256_load_ps((float*)a);
+++ tmp = _mm256_xor_ps(tmp, conjugator1); // invert every bit of the odd bytes (the parts lv_conj negates)
+++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
+++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); // +1 on the odd bytes completes the two's-complement negation
+++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
+++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
+++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
+++ tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); // insertf128_si256 yields __m256i, so cast back to __m256
+++ _mm256_store_ps((float*)c, tmp);
+++
+++ a += 16;
+++ c += 16;
+++ }
+++
+++ for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining (< 16) values
+++ {
+++ *c++ = lv_conj(*a++);
+++ }
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSSE3
+++#include "tmmintrin.h"
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts (SSSE3, 16-byte aligned buffers)
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ const unsigned int sse_iters = num_points / 8; // each 16-byte load is followed by an advance of 8 lv_8sc_t elements
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ __m128i tmp;
+++
+++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+++
+++ for (int i = 0; i < sse_iters; ++i)
+++ {
+++ tmp = _mm_load_si128((__m128i*)a);
+++ tmp = _mm_sign_epi8(tmp, conjugator); // keeps the even bytes and negates the odd bytes
+++ _mm_store_si128((__m128i*)c, tmp);
+++ a += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining (< 8) values
+++ {
+++ *c++ = lv_conj(*a++);
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSSE3 */
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts (SSE3, 16-byte aligned buffers)
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ const unsigned int sse_iters = num_points / 8; // each 16-byte load is followed by an advance of 8 lv_8sc_t elements
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ __m128i tmp;
+++
+++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+++
+++ for (int i = 0; i < sse_iters; ++i)
+++ {
+++ tmp = _mm_load_si128((__m128i*)a);
+++ tmp = _mm_xor_si128(tmp, conjugator1); // invert every bit of the odd bytes
+++ tmp = _mm_add_epi8(tmp, conjugator2); // +1 on the odd bytes completes the two's-complement negation
+++ _mm_store_si128((__m128i*)c, tmp);
+++ a += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the remaining (< 8) values
+++ {
+++ *c++ = lv_conj(*a++);
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Conjugates a complex vector with 8-bit real and imaginary parts, one lv_conj call per element
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++ unsigned int number = 0;
+++
+++ for(number = 0; number < num_points; number++){
+++ *cPtr++ = lv_conj(*aPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Conjugates a complex vector by forwarding all arguments to the externally defined volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+++ */
+++extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++ volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,320 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
+++ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that calculates the magnitude squared of a
+++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+++ * result = (real*real) + (imag*imag)
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
+++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++#include "tmmintrin.h"
+++/*!
+++ \brief Calculates the magnitude squared of complexVector and stores the low 8 bits of each result in magnitudeVector (unaligned buffers)
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 16; // 16 complex samples (32 bytes) per iteration
+++
+++ const char* complexVectorPtr = (const char*)complexVector;
+++ char* magnitudeVectorPtr = magnitudeVector;
+++
+++ __m128i result8;
+++ __m128i avector, areal, aimag, asum;
+++ __m128i bvector, breal, bimag, bsum;
+++
+++ // No SSSE3 intrinsics (_mm_hadd_epi16 / _mm_shuffle_epi8) are used, so this kernel matches its LV_HAVE_SSE3 guard.
+++ // A 16-bit lane holds one sample: first (real) byte in the low half, second (imaginary) byte in the high half.
+++ const __m128i lowbyte = _mm_set1_epi16(0x00FF);
+++
+++ for(unsigned int number = 0;number < sse_iters; number++)
+++ {
+++ avector = _mm_lddqu_si128((const __m128i*)complexVectorPtr);
+++ areal = _mm_and_si128(avector, lowbyte); // zero-extended real bytes
+++ aimag = _mm_srli_epi16(avector, 8); // zero-extended imaginary bytes
+++ areal = _mm_mullo_epi16(areal, areal); // only the low 8 bits of each square matter, so zero-extension is enough
+++ aimag = _mm_mullo_epi16(aimag, aimag);
+++ asum = _mm_add_epi16(areal, aimag);
+++
+++ complexVectorPtr += 16;
+++
+++ bvector = _mm_lddqu_si128((const __m128i*)complexVectorPtr);
+++ breal = _mm_and_si128(bvector, lowbyte);
+++ bimag = _mm_srli_epi16(bvector, 8);
+++ breal = _mm_mullo_epi16(breal, breal);
+++ bimag = _mm_mullo_epi16(bimag, bimag);
+++ bsum = _mm_add_epi16(breal, bimag);
+++
+++ complexVectorPtr += 16;
+++
+++ result8 = _mm_packus_epi16(_mm_and_si128(asum, lowbyte), _mm_and_si128(bsum, lowbyte)); // masked to 0..255, so the saturating pack just truncates
+++
+++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8);
+++
+++ magnitudeVectorPtr += 16;
+++
+++
+++ }
+++
+++ for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining (< 16) samples
+++ {
+++ const char valReal = *complexVectorPtr++;
+++ const char valImag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++//#ifdef LV_HAVE_SSE
+++//#include <xmmintrin.h>
+++///*!
+++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
+++// \param complexVector The vector containing the complex input values
+++// \param magnitudeVector The vector containing the real output values
+++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+++// */
+++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++// unsigned int number = 0;
+++// const unsigned int quarterPoints = num_points / 4;
+++//
+++// const float* complexVectorPtr = (float*)complexVector;
+++// float* magnitudeVectorPtr = magnitudeVector;
+++//
+++// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+++// for(;number < quarterPoints; number++){
+++// cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+++// complexVectorPtr += 4;
+++//
+++// cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+++// complexVectorPtr += 4;
+++//
+++// // Arrange in i1i2i3i4 format
+++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+++// // Arrange in q1q2q3q4 format
+++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+++//
+++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+++//
+++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+++//
+++// _mm_storeu_ps(magnitudeVectorPtr, result);
+++// magnitudeVectorPtr += 4;
+++// }
+++//
+++// number = quarterPoints * 4;
+++// for(; number < num_points; number++){
+++// float val1Real = *complexVectorPtr++;
+++// float val1Imag = *complexVectorPtr++;
+++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+++// }
+++//}
+++//#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Calculates the magnitude squared of complexVector and stores the results, narrowed to char, in magnitudeVector
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+++ const char* complexVectorPtr = (char*)complexVector; // input is read as consecutive (real, imaginary) char pairs
+++ char* magnitudeVectorPtr = magnitudeVector;
+++
+++ for(int number = 0; number < num_points; number++){ // NOTE(review): signed counter compared against unsigned num_points
+++ const char real = *complexVectorPtr++;
+++ const char imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // the int result is converted to char on assignment
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
+++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <math.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Calculates the magnitude squared of complexVector and stores the low 8 bits of each result in magnitudeVector (16-byte aligned buffers)
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 16; // 16 complex samples (32 bytes) per iteration
+++
+++ const char* complexVectorPtr = (const char*)complexVector;
+++ char* magnitudeVectorPtr = magnitudeVector;
+++
+++ __m128i result8;
+++ __m128i avector, areal, aimag, asum;
+++ __m128i bvector, breal, bimag, bsum;
+++
+++ // No SSSE3 intrinsics (_mm_hadd_epi16 / _mm_shuffle_epi8) are used, so this kernel matches its LV_HAVE_SSE3 guard.
+++ // A 16-bit lane holds one sample: first (real) byte in the low half, second (imaginary) byte in the high half.
+++ const __m128i lowbyte = _mm_set1_epi16(0x00FF);
+++
+++ for(unsigned int number = 0;number < sse_iters; number++)
+++ {
+++ avector = _mm_load_si128((const __m128i*)complexVectorPtr);
+++ areal = _mm_and_si128(avector, lowbyte); // zero-extended real bytes
+++ aimag = _mm_srli_epi16(avector, 8); // zero-extended imaginary bytes
+++ areal = _mm_mullo_epi16(areal, areal); // only the low 8 bits of each square matter, so zero-extension is enough
+++ aimag = _mm_mullo_epi16(aimag, aimag);
+++ asum = _mm_add_epi16(areal, aimag);
+++
+++ complexVectorPtr += 16;
+++
+++ bvector = _mm_load_si128((const __m128i*)complexVectorPtr);
+++ breal = _mm_and_si128(bvector, lowbyte);
+++ bimag = _mm_srli_epi16(bvector, 8);
+++ breal = _mm_mullo_epi16(breal, breal);
+++ bimag = _mm_mullo_epi16(bimag, bimag);
+++ bsum = _mm_add_epi16(breal, bimag);
+++
+++ complexVectorPtr += 16;
+++
+++ result8 = _mm_packus_epi16(_mm_and_si128(asum, lowbyte), _mm_and_si128(bsum, lowbyte)); // masked to 0..255, so the saturating pack just truncates
+++
+++ _mm_store_si128((__m128i*)magnitudeVectorPtr, result8);
+++
+++ magnitudeVectorPtr += 16;
+++
+++
+++ }
+++
+++ for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining (< 16) samples
+++ {
+++ const char valReal = *complexVectorPtr++;
+++ const char valImag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
+++ }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++//#ifdef LV_HAVE_SSE
+++//#include <xmmintrin.h>
+++///*!
+++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
+++// \param complexVector The vector containing the complex input values
+++// \param magnitudeVector The vector containing the real output values
+++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+++// */
+++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++// unsigned int number = 0;
+++// const unsigned int quarterPoints = num_points / 4;
+++//
+++// const float* complexVectorPtr = (float*)complexVector;
+++// float* magnitudeVectorPtr = magnitudeVector;
+++//
+++// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+++// for(;number < quarterPoints; number++){
+++// cplxValue1 = _mm_load_ps(complexVectorPtr);
+++// complexVectorPtr += 4;
+++//
+++// cplxValue2 = _mm_load_ps(complexVectorPtr);
+++// complexVectorPtr += 4;
+++//
+++// // Arrange in i1i2i3i4 format
+++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+++// // Arrange in q1q2q3q4 format
+++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+++//
+++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+++//
+++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+++//
+++// _mm_store_ps(magnitudeVectorPtr, result);
+++// magnitudeVectorPtr += 4;
+++// }
+++//
+++// number = quarterPoints * 4;
+++// for(; number < num_points; number++){
+++// float val1Real = *complexVectorPtr++;
+++// float val1Imag = *complexVectorPtr++;
+++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+++// }
+++//}
+++//#endif /* LV_HAVE_SSE */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Calculates the magnitude squared of complexVector and stores the results, narrowed to char, in magnitudeVector
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+++ const char* complexVectorPtr = (char*)complexVector; // input is read as consecutive (real, imaginary) char pairs
+++ char* magnitudeVectorPtr = magnitudeVector;
+++
+++ for(int number = 0; number < num_points; number++){ // NOTE(review): signed counter compared against unsigned num_points
+++ const char real = *complexVectorPtr++;
+++ const char imag = *complexVectorPtr++;
+++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); // the int result is converted to char on assignment
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Calculates the magnitude squared by forwarding all arguments to the externally defined volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ */
+++extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+++ volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,271 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
+++ * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that multiplies a group of 16 bits vectors
+++ * (8 bits the real part and 8 bits the imaginary part) by one constant vector
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Multiplies every complex value of aVector by one complex scalar and stores the products in cVector (SSE3, unaligned loads and stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 8; // each pass of the vector loop advances a and c by 8 values
+++
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++
+++ y = _mm_set1_epi16 (*(short*)&scalar); // NOTE(review): reads scalar through a short*; assumes lv_8sc_t is exactly 16 bits wide - confirm
+++ imagy = _mm_srli_si128 (y, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ x = _mm_lddqu_si128((__m128i*)a); // unaligned 128-bit load
+++
+++ imagx = _mm_srli_si128 (x, 1);
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ realc = _mm_and_si128 (realc, mult1); // only the low 8 bits of each result are kept
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++ imagc = _mm_and_si128 (imagc, mult1);
+++ imagc = _mm_slli_si128 (imagc, 1); // move back up to the second byte of each lane
+++
+++ totalc = _mm_or_si128 (realc, imagc); // merge the two byte streams into one register
+++
+++ _mm_storeu_si128((__m128i*)c, totalc);
+++
+++ a += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ *c++ = (*a++) * scalar;
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies every complex value of aVector by one complex scalar and stores the products in cVector (plain C version, loop unrolled by 8)
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+++
+++ /*lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++
+++ for (int i = 0; i<num_points; ++i)
+++ {
+++ *cPtr++ = (*aPtr++) * scalar;
+++ }*/
+++
+++ lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++ unsigned int number = num_points; // count of values still to be processed
+++
+++ // manually unrolled loop: eight products per pass
+++ while (number >= 8){
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ number -= 8;
+++ }
+++
+++ // clean up the remaining (fewer than 8) values one at a time
+++ while (number-- > 0)
+++ *cPtr++ = *aPtr++ * scalar;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++/*!
+++ \brief Multiplies every complex value of aVector by one complex scalar and stores the products in cVector (SSE3, aligned loads and stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 8; // each pass of the vector loop advances a and c by 8 values
+++
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+++
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++
+++ y = _mm_set1_epi16 (*(short*)&scalar); // NOTE(review): reads scalar through a short*; assumes lv_8sc_t is exactly 16 bits wide - confirm
+++ imagy = _mm_srli_si128 (y, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ x = _mm_load_si128((__m128i*)a); // aligned 128-bit load
+++
+++ imagx = _mm_srli_si128 (x, 1);
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ realc = _mm_and_si128 (realc, mult1); // only the low 8 bits of each result are kept
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++ imagc = _mm_and_si128 (imagc, mult1);
+++ imagc = _mm_slli_si128 (imagc, 1); // move back up to the second byte of each lane
+++
+++ totalc = _mm_or_si128 (realc, imagc); // merge the two byte streams into one register
+++
+++ _mm_store_si128((__m128i*)c, totalc); // aligned 128-bit store
+++
+++ a += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ *c++ = (*a++) * scalar;
+++ }
+++
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies every complex value of aVector by one complex scalar and stores the products in cVector (plain C version, loop unrolled by 8)
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+++
+++ /*lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++
+++ for (int i = 0; i<num_points; ++i)
+++ {
+++ *cPtr++ = (*aPtr++) * scalar;
+++ }*/
+++
+++ lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++ unsigned int number = num_points; // count of values still to be processed
+++
+++ // manually unrolled loop: eight products per pass
+++ while (number >= 8){
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ *cPtr++ = (*aPtr++) * scalar;
+++ number -= 8;
+++ }
+++
+++ // clean up the remaining (fewer than 8) values one at a time
+++ while (number-- > 0)
+++ *cPtr++ = *aPtr++ * scalar;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Multiplies every complex value of aVector by one complex scalar and stores the products in cVector by forwarding to the external ORC implementation
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ */
+++extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+++ volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points); // the scalar is handed over as separate real and imaginary parts
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,499 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
+++ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
+++ * and 8 bits the imaginary part) and accumulates them
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <stdio.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (plain C version)
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+++
+++ /*lv_8sc_t* cPtr = result;
+++ const lv_8sc_t* aPtr = input;
+++ const lv_8sc_t* bPtr = taps;
+++
+++ for(int number = 0; number < num_points; number++){
+++ *cPtr += (*aPtr++) * (*bPtr++);
+++ }*/
+++
+++ char * res = (char*) result; // byte views: even indices are used as real parts, odd indices as imaginary parts
+++ char * in = (char*) input;
+++ char * tp = (char*) taps;
+++ unsigned int n_2_ccomplex_blocks = num_points/2; // number of pairs of complex values
+++ unsigned int isodd = num_points & 1; // 1 when one value is left over after the pairs
+++
+++ char sum0[2] = {0,0}; // 8-bit accumulator for the first value of each pair
+++ char sum1[2] = {0,0}; // 8-bit accumulator for the second value of each pair
+++ unsigned int i = 0;
+++
+++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+++
+++ in += 4;
+++ tp += 4;
+++ }
+++
+++ res[0] = sum0[0] + sum1[0]; // overwrites (does not add to) the value *result held on entry
+++ res[1] = sum0[1] + sum1[1];
+++
+++ // Cleanup if we had an odd number of points: add the product of the last pair of values
+++ for(i = 0; i < isodd; ++i) {
+++ *result += input[num_points - 1] * taps[num_points - 1];
+++ }
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (SSE2, unaligned loads)
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+++
+++ lv_8sc_t dotProduct;
+++ memset(&dotProduct, 0x0, 2*sizeof(char));
+++
+++ const lv_8sc_t* a = input;
+++ const lv_8sc_t* b = taps;
+++
+++ const unsigned int sse_iters = num_points/8; // each pass of the vector loop advances a and b by 8 values
+++
+++ if (sse_iters>0)
+++ {
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++ realcacc = _mm_setzero_si128();
+++ imagcacc = _mm_setzero_si128();
+++
+++ for(unsigned int number = 0; number < sse_iters; number++){
+++
+++ x = _mm_loadu_si128((__m128i*)a); // SSE2 unaligned load; _mm_lddqu_si128 is SSE3 and is not declared by emmintrin.h
+++ y = _mm_loadu_si128((__m128i*)b);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++
+++ realcacc = _mm_add_epi16 (realcacc, realc); // per-lane running sums
+++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
+++
+++ a += 8;
+++ b += 8;
+++ }
+++
+++ realcacc = _mm_and_si128 (realcacc, mult1); // only the low 8 bits of each sum are kept
+++ imagcacc = _mm_and_si128 (imagcacc, mult1);
+++ imagcacc = _mm_slli_si128 (imagcacc, 1); // move back up to the second byte of each lane
+++
+++ totalc = _mm_or_si128 (realcacc, imagcacc); // merge the two byte streams into one register
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+++
+++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<8; ++i) // horizontal reduction of the eight partial sums
+++ {
+++ dotProduct += dotProductVector[i];
+++ }
+++ }
+++
+++ for (unsigned int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ dotProduct += (*a++) * (*b++);
+++ }
+++
+++ *result = dotProduct;
+++}
+++
+++#endif /*LV_HAVE_SSE2*/
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (SSE4.1, unaligned loads)
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+++
+++ lv_8sc_t dotProduct;
+++ memset(&dotProduct, 0x0, 2*sizeof(char));
+++
+++ const lv_8sc_t* a = input;
+++ const lv_8sc_t* b = taps;
+++
+++ const unsigned int sse_iters = num_points/8; // each pass of the vector loop advances a and b by 8 values
+++
+++ if (sse_iters>0)
+++ {
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++ realcacc = _mm_setzero_si128();
+++ imagcacc = _mm_setzero_si128();
+++
+++ for(int number = 0; number < sse_iters; number++){
+++
+++ x = _mm_lddqu_si128((__m128i*)a); // unaligned 128-bit loads
+++ y = _mm_lddqu_si128((__m128i*)b);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++
+++ realcacc = _mm_add_epi16 (realcacc, realc); // per-lane running sums
+++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
+++
+++ a += 8;
+++ b += 8;
+++ }
+++
+++ imagcacc = _mm_slli_si128 (imagcacc, 1); // move the imaginary sums up to the second byte of each lane
+++
+++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); // take realcacc bytes where the mask is set, imagcacc bytes elsewhere
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+++
+++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<8; ++i) // horizontal reduction of the eight partial sums
+++ {
+++ dotProduct += dotProductVector[i];
+++ }
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ dotProduct += (*a++) * (*b++);
+++ }
+++
+++ *result = dotProduct;
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <stdio.h>
+++#include <string.h>
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (plain C version)
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+++
+++ /*lv_8sc_t* cPtr = result;
+++ const lv_8sc_t* aPtr = input;
+++ const lv_8sc_t* bPtr = taps;
+++
+++ for(int number = 0; number < num_points; number++){
+++ *cPtr += (*aPtr++) * (*bPtr++);
+++ }*/
+++
+++ char * res = (char*) result; // byte views: even indices are used as real parts, odd indices as imaginary parts
+++ char * in = (char*) input;
+++ char * tp = (char*) taps;
+++ unsigned int n_2_ccomplex_blocks = num_points/2; // number of pairs of complex values
+++ unsigned int isodd = num_points & 1; // 1 when one value is left over after the pairs
+++
+++ char sum0[2] = {0,0}; // 8-bit accumulator for the first value of each pair
+++ char sum1[2] = {0,0}; // 8-bit accumulator for the second value of each pair
+++ unsigned int i = 0;
+++
+++ for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+++ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+++ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+++ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+++ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+++
+++ in += 4;
+++ tp += 4;
+++ }
+++
+++ res[0] = sum0[0] + sum1[0]; // overwrites (does not add to) the value *result held on entry
+++ res[1] = sum0[1] + sum1[1];
+++
+++ // Cleanup if we had an odd number of points: add the product of the last pair of values
+++ for(i = 0; i < isodd; ++i) {
+++ *result += input[num_points - 1] * taps[num_points - 1];
+++ }
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (SSE2, aligned loads)
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+++
+++ lv_8sc_t dotProduct;
+++ memset(&dotProduct, 0x0, 2*sizeof(char));
+++
+++ const lv_8sc_t* a = input;
+++ const lv_8sc_t* b = taps;
+++
+++ const unsigned int sse_iters = num_points/8; // each pass of the vector loop advances a and b by 8 values
+++
+++ if (sse_iters>0)
+++ {
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++ realcacc = _mm_setzero_si128();
+++ imagcacc = _mm_setzero_si128();
+++
+++ for(int number = 0; number < sse_iters; number++){
+++
+++ x = _mm_load_si128((__m128i*)a); // aligned 128-bit loads
+++ y = _mm_load_si128((__m128i*)b);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++
+++ realcacc = _mm_add_epi16 (realcacc, realc); // per-lane running sums
+++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
+++
+++ a += 8;
+++ b += 8;
+++ }
+++
+++ realcacc = _mm_and_si128 (realcacc, mult1); // only the low 8 bits of each sum are kept
+++ imagcacc = _mm_and_si128 (imagcacc, mult1);
+++ imagcacc = _mm_slli_si128 (imagcacc, 1); // move back up to the second byte of each lane
+++
+++ totalc = _mm_or_si128 (realcacc, imagcacc); // merge the two byte streams into one register
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+++
+++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<8; ++i) // horizontal reduction of the eight partial sums
+++ {
+++ dotProduct += dotProductVector[i];
+++ }
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ dotProduct += (*a++) * (*b++);
+++ }
+++
+++ *result = dotProduct;
+++}
+++
+++#endif /*LV_HAVE_SSE2*/
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value (SSE4.1, aligned loads)
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
+++
+++ lv_8sc_t dotProduct;
+++ memset(&dotProduct, 0x0, 2*sizeof(char));
+++
+++ const lv_8sc_t* a = input;
+++ const lv_8sc_t* b = taps;
+++
+++ const unsigned int sse_iters = num_points/8; // each pass of the vector loop advances a and b by 8 values
+++
+++ if (sse_iters>0)
+++ {
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++ realcacc = _mm_setzero_si128();
+++ imagcacc = _mm_setzero_si128();
+++
+++ for(int number = 0; number < sse_iters; number++){
+++
+++ x = _mm_load_si128((__m128i*)a); // aligned 128-bit loads
+++ y = _mm_load_si128((__m128i*)b);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++
+++ realcacc = _mm_add_epi16 (realcacc, realc); // per-lane running sums
+++ imagcacc = _mm_add_epi16 (imagcacc, imagc);
+++
+++ a += 8;
+++ b += 8;
+++ }
+++
+++ imagcacc = _mm_slli_si128 (imagcacc, 1); // move the imaginary sums up to the second byte of each lane
+++
+++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); // take realcacc bytes where the mask is set, imagcacc bytes elsewhere
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
+++
+++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<8; ++i) // horizontal reduction of the eight partial sums
+++ {
+++ dotProduct += dotProductVector[i];
+++ }
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ dotProduct += (*a++) * (*b++);
+++ }
+++
+++ *result = dotProduct;
+++}
+++
+++#endif /*LV_HAVE_SSE4_1*/
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products into one complex value, using the external ORC implementation
+++ \param result Pointer to the complex value where the accumulated result will be stored
+++ \param input One of the vectors to be multiplied and accumulated
+++ \param taps One of the vectors to be multiplied and accumulated
+++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result
+++ */
+++extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){
+++
+++ short resReal = 0; // 16-bit accumulator filled in by the ORC implementation
+++ char* resRealChar = (char*)&resReal;
+++ resRealChar++; // points at the byte at offset 1 of resReal; NOTE(review): which half of the short this is depends on byte order - confirm
+++
+++ short resImag = 0; // 16-bit accumulator filled in by the ORC implementation
+++ char* resImagChar = (char*)&resImag;
+++ resImagChar++; // points at the byte at offset 1 of resImag
+++
+++ volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points);
+++
+++ *result = lv_cmake(*resRealChar, *resImagChar); // result is built from the offset-1 byte of each accumulator
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,346 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h
+++ * \brief Volk protokernel: multiplies two 16 bits vectors
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
+++ * and 8 bits the imaginary part)
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and stores the products in cVector (SSE2, unaligned loads and stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 8; // each pass of the vector loop advances a, b and c by 8 values
+++
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ const lv_8sc_t* b = bVector;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++
+++ for(unsigned int number = 0;number < sse_iters; number++){
+++
+++ x = _mm_loadu_si128((__m128i*)a); // SSE2 unaligned load; _mm_lddqu_si128 is SSE3 and is not declared by emmintrin.h
+++ y = _mm_loadu_si128((__m128i*)b);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ realc = _mm_and_si128 (realc, mult1); // only the low 8 bits of each result are kept
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++ imagc = _mm_and_si128 (imagc, mult1);
+++ imagc = _mm_slli_si128 (imagc, 1); // move back up to the second byte of each lane
+++
+++ totalc = _mm_or_si128 (realc, imagc); // merge the two byte streams into one register
+++
+++ _mm_storeu_si128((__m128i*)c, totalc);
+++
+++ a += 8;
+++ b += 8;
+++ c += 8;
+++ }
+++
+++ for (unsigned int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ *c++ = (*a++) * (*b++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element by element and stores the products in cVector (SSE4.1, unaligned loads and stores)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 8; // each pass of the vector loop advances a, b and c by 8 values
+++
+++ __m128i x, y, zero;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ const lv_8sc_t* b = bVector;
+++
+++ zero = _mm_setzero_si128(); // NOTE(review): zero is assigned here but never read in this function
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask keeping the low byte of every 16-bit lane
+++
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ x = _mm_lddqu_si128((__m128i*)a); // unaligned 128-bit loads
+++ y = _mm_lddqu_si128((__m128i*)b);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the second byte of each lane lands in the low byte
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each 16-bit product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re
+++ imagc = _mm_slli_si128 (imagc, 1); // move up to the second byte of each lane
+++
+++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); // take realc bytes where the mask is set, imagc bytes elsewhere
+++
+++ _mm_storeu_si128((__m128i*)c, totalc);
+++
+++ a += 8;
+++ b += 8;
+++ c += 8;
+++ }
+++
+++ for (int i = 0; i<(num_points % 8); ++i) // scalar tail for the values that do not fill a whole register
+++ {
+++ *c++ = (*a++) * (*b++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies the two input complex vectors element-wise and stores the results in the third vector (portable scalar version)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++ lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++ const lv_8sc_t* bPtr = bVector;
+++ // The index is unsigned so that it matches the type of num_points in the loop condition.
+++ for(unsigned int number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) * (*bPtr++);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element-wise and stores the results in the third vector (aligned SSE2 version)
+++ \param cVector The vector where the results will be stored; written with _mm_store_si128, so it must be 16-byte aligned
+++ \param aVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+++ \param bVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector; the last num_points % 8 values are handled by a scalar loop
+++ */
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ const lv_8sc_t* b = bVector;
+++ // 0x00FF in every 16-bit lane (same bit pattern as _mm_set_epi8(0, 255, ...), without passing 255 as a char).
+++ mult1 = _mm_set1_epi16(0x00FF);
+++
+++ for(unsigned int number = 0; number < sse_iters; number++){
+++
+++ x = _mm_load_si128((__m128i*)a);
+++ y = _mm_load_si128((__m128i*)b);
+++ // Split the interleaved bytes: a one-byte right shift brings the imaginary bytes into the low byte of each 16-bit lane.
+++ imagx = _mm_srli_si128 (x, 1);
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++ // Four partial products per sample, computed in 16-bit lanes.
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++ // re = re*re - im*im, im = re*im + im*re; only the low byte of each 16-bit result is kept.
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ realc = _mm_and_si128 (realc, mult1);
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++ imagc = _mm_and_si128 (imagc, mult1);
+++ imagc = _mm_slli_si128 (imagc, 1);
+++
+++ totalc = _mm_or_si128 (realc, imagc);
+++
+++ _mm_store_si128((__m128i*)c, totalc);
+++
+++ a += 8;
+++ b += 8;
+++ c += 8;
+++ }
+++ // Scalar tail for the remaining num_points % 8 samples.
+++ for (unsigned int i = 0; i<(num_points % 8); ++i)
+++ {
+++ *c++ = (*a++) * (*b++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Multiplies the two input complex vectors element-wise and stores the results in the third vector (aligned SSE4.1 version)
+++ \param cVector The vector where the results will be stored; written with _mm_store_si128, so it must be 16-byte aligned
+++ \param aVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+++ \param bVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector; the last num_points % 8 values are handled by a scalar loop
+++ */
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, y;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
+++ lv_8sc_t* c = cVector;
+++ const lv_8sc_t* a = aVector;
+++ const lv_8sc_t* b = bVector;
+++
+++ // 0x00FF in every 16-bit lane: isolates the low byte of each lane and, as a blend mask, selects the even bytes.
+++ mult1 = _mm_set1_epi16(0x00FF);
+++
+++ for(unsigned int number = 0; number < sse_iters; number++){
+++
+++ x = _mm_load_si128((__m128i*)a);
+++ y = _mm_load_si128((__m128i*)b);
+++ // Split the interleaved bytes: a one-byte right shift brings the imaginary bytes into the low byte of each 16-bit lane.
+++ imagx = _mm_srli_si128 (x, 1);
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++ // Four partial products per sample, computed in 16-bit lanes.
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++ // re = re*re - im*im, im = re*im + im*re; the imaginary result is moved up one byte for re-interleaving.
+++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++ imagc = _mm_slli_si128 (imagc, 1);
+++ // The byte blend takes realc where the mask byte is 0xFF (even bytes) and imagc elsewhere (odd bytes).
+++ totalc = _mm_blendv_epi8 (imagc, realc, mult1);
+++
+++ _mm_store_si128((__m128i*)c, totalc);
+++
+++ a += 8;
+++ b += 8;
+++ c += 8;
+++ }
+++ // Scalar tail for the remaining num_points % 8 samples.
+++ for (unsigned int i = 0; i<(num_points % 8); ++i)
+++ {
+++ *c++ = (*a++) * (*b++);
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies the two input complex vectors element-wise and stores the results in the third vector (portable scalar version)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++ lv_8sc_t* cPtr = cVector;
+++ const lv_8sc_t* aPtr = aVector;
+++ const lv_8sc_t* bPtr = bVector;
+++ // The index is unsigned so that it matches the type of num_points in the loop condition.
+++ for(unsigned int number = 0; number < num_points; number++){
+++ *cPtr++ = (*aPtr++) * (*bPtr++);
+++ }
+++
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Multiplies the two input complex vectors and stores their results in the third vector (ORC version: forwards to the external volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be multiplied
+++ \param bVector One of the vectors to be multiplied
+++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector; passed unchanged to the ORC implementation
+++ */
+++extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+++ volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,613 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Early, Prompt, and Late correlation on 16-bit complex samples (8 bits for
+++ * the real part and 8 bits for the imaginary part), and accumulates the
+++ * results in 32-bit single-precision values, returning float32 values:
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 16-bit complex samples). It yields the input
+++ * signal in baseband (BB).
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 16-bit complex samples), accumulating the results into float32 values.
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 16-bit complex samples), accumulating the results into float32 values.
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 16-bit complex samples), accumulating the results into float32 values.
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE4.1 version)
+++ \param input The input signal
+++ \param carrier The carrier signal the input is multiplied by (carrier wipe-off)
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 E_code_acc, P_code_acc, L_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+++ __m128 output_ps;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ // The three outputs are zeroed first; every later write to them is an accumulation.
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ // Mask with the low byte set in every 16-bit lane; it is handed to the CM_* macros below.
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++ E_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x = _mm_lddqu_si128((__m128i*)input_ptr);
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++ // NOTE(review): the CM_* macros are defined in CommonMacros/*.h (not visible here); presumably they split each vector into 16-bit real/imag lanes and multiply them -- confirm.
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+++
+++ //Get prompt values
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+++
+++ //Get late values
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ }
+++ // Each __m128 accumulator holds four floats, stored below as two lv_32fc_t values and then summed into the output.
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++
+++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i)
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ }
+++ }
+++ // Scalar tail: the remaining num_points % 8 samples are processed one at a time.
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE2 version)
+++ \param input The input signal
+++ \param carrier The carrier signal the input is multiplied by (carrier wipe-off)
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 E_code_acc, P_code_acc, L_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 output_ps_1, output_ps_2;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ // The three outputs are zeroed first; every later write to them is an accumulation.
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ // Mask with the low byte set in every 16-bit lane; it is handed to the CM_* macros below.
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++ E_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(unsigned int number = 0; number < sse_iters; number++){
+++ // All loads use _mm_loadu_si128 (SSE2); _mm_lddqu_si128 is an SSE3 intrinsic and is not available under LV_HAVE_SSE2.
+++ //Perform the carrier wipe-off
+++ x = _mm_loadu_si128((const __m128i*)input_ptr);
+++ y = _mm_loadu_si128((const __m128i*)carrier_ptr);
+++ // NOTE(review): the CM_* macros are defined in CommonMacros/*.h (not visible here); presumably they split each vector into 16-bit real/imag lanes and multiply them -- confirm.
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y = _mm_loadu_si128((const __m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++ //Get prompt values
+++ y = _mm_loadu_si128((const __m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++ //Get late values
+++ y = _mm_loadu_si128((const __m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ }
+++ // Each __m128 accumulator holds four floats, stored below as two lv_32fc_t values and then summed into the output.
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++
+++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i)
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ }
+++ }
+++ // Scalar tail: the remaining num_points % 8 samples are processed one at a time.
+++ lv_8sc_t bb_signal_sample;
+++ for(unsigned int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
+++ \param input The input signal
+++ \param carrier The carrier signal the input is multiplied by (carrier wipe-off)
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ lv_8sc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++ // The three outputs are zeroed first; every later write to them is an accumulation.
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation (unsigned index, matching the type of num_points)
+++ for(unsigned int i=0; i < num_points; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Now get early, late, and prompt values for each
+++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (aligned SSE4.1 version)
+++ \param input The input signal (read with _mm_load_si128, so it must be 16-byte aligned)
+++ \param carrier The carrier signal the input is multiplied by (read with _mm_load_si128, so it must be 16-byte aligned)
+++ \param E_code Early PRN code replica input (read with _mm_load_si128, so it must be 16-byte aligned)
+++ \param P_code Prompt PRN code replica input (read with _mm_load_si128, so it must be 16-byte aligned)
+++ \param L_code Late PRN code replica input (read with _mm_load_si128, so it must be 16-byte aligned)
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 E_code_acc, P_code_acc, L_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+++ __m128 output_ps;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ // The three outputs are zeroed first; every later write to them is an accumulation.
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ // Mask with the low byte set in every 16-bit lane; it is handed to the CM_* macros below.
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++ E_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x = _mm_load_si128((__m128i*)input_ptr);
+++ y = _mm_load_si128((__m128i*)carrier_ptr);
+++ // NOTE(review): the CM_* macros are defined in CommonMacros/*.h (not visible here); presumably they split each vector into 16-bit real/imag lanes and multiply them -- confirm.
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get early values
+++ y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+++
+++ //Get prompt values
+++ y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+++
+++ //Get late values
+++ y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ }
+++ // Each __m128 accumulator holds four floats, stored below as two lv_32fc_t values and then summed into the output.
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++
+++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i)
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ }
+++ }
+++ // Scalar tail: the remaining num_points % 8 samples are processed one at a time.
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{ // Mixes input with carrier and accumulates the products with the E/P/L codes into three single float-complex outputs
+++ const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 complex samples per pass (every pointer advances by 8)
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 E_code_acc, P_code_acc, L_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 output_ps_1, output_ps_2;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0; // the outputs are accumulated with += below, so clear them first
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0xFF in the low byte of every 16-bit lane; handed to the CM_* macros below
+++
+++ E_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off (aligned 16-byte loads: input and carrier must be 16-byte aligned)
+++ x = _mm_load_si128((__m128i*)input_ptr);
+++ y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) // NOTE(review): the CM_* macros are defined outside this view; by name this splits x into real/imag lanes - confirm
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) // presumably leaves input*carrier in real_/imag_bb_signal_sample - verify against the macro
+++
+++ //Get early values (aligned load: E_code must be 16-byte aligned as well)
+++ y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); // output_ps_1/output_ps_2 are only written by the macro above; both are folded into the Early accumulator
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++ //Get prompt values (same steps as for the early code, accumulated into P_code_acc)
+++ y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++ //Get late values (same steps as for the early code, accumulated into L_code_acc)
+++ y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++
+++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i) // each accumulator was stored as two complex partial sums; add both into the output
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide vector loop
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar implementation)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ lv_8sc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation (index is unsigned to match num_points and avoid signed overflow)
+++ for(unsigned int i=0; i < num_points; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Now get early, late, and prompt values for each
+++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,874 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
+++ * real part and 8 bits the imaginary part):
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 16 bits vectors). It returns the input
+++ * signal in base band (BB)
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 16 bits vectors), accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 16 bits vectors), accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 16 bits vectors), accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++ /*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 variant using unaligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 complex samples per pass (every pointer advances by 8)
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_8sc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_8sc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_8sc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0; // the outputs are accumulated with += below, so clear them first
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte
+++
+++ real_E_code_acc = _mm_setzero_si128();
+++ imag_E_code_acc = _mm_setzero_si128();
+++ real_L_code_acc = _mm_setzero_si128();
+++ imag_L_code_acc = _mm_setzero_si128();
+++ real_P_code_acc = _mm_setzero_si128();
+++ imag_P_code_acc = _mm_setzero_si128();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off (unaligned 16-byte loads of input and carrier)
+++ x = _mm_lddqu_si128((__m128i*)input_ptr);
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so every odd (imaginary) byte lands in the low byte of its 16-bit lane
+++ imagx = _mm_and_si128 (imagx, mult1); // clear the high byte: imaginary parts as zero-extended 16-bit lanes
+++ realx = _mm_and_si128 (x, mult1); // keep the even (real) bytes as zero-extended 16-bit lanes
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each lane-wise product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // Re = xr*yr - xi*yi
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // Im = xr*yi + xi*yr
+++
+++ //Get early values: complex-multiply the base-band lanes by the Early code and accumulate
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
+++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+++
+++ //Get late values: same steps with the Late code
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+++
+++ //Get prompt values: same steps with the Prompt code
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ L_code_ptr += 8;
+++ P_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the low byte of each imaginary sum into the odd byte position
+++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); // even bytes from the real sums, odd bytes from the shifted imaginary sums: only the low 8 bits of each 16-bit sum are kept
+++ _mm_storeu_si128((__m128i*)E_dotProductVector, output);
+++
+++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1);
+++ _mm_storeu_si128((__m128i*)L_dotProductVector, output);
+++
+++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
+++ _mm_storeu_si128((__m128i*)P_dotProductVector, output);
+++
+++ for (int i = 0; i<8; ++i) // add the 8 re-interleaved complex partial sums into each output
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide vector loop
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+++ }
+++}
+++
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 variant using unaligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 complex samples per pass (every pointer advances by 8)
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_8sc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_8sc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_8sc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0; // the outputs are accumulated with += below, so clear them first
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte
+++
+++ real_E_code_acc = _mm_setzero_si128();
+++ imag_E_code_acc = _mm_setzero_si128();
+++ real_L_code_acc = _mm_setzero_si128();
+++ imag_L_code_acc = _mm_setzero_si128();
+++ real_P_code_acc = _mm_setzero_si128();
+++ imag_P_code_acc = _mm_setzero_si128();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off (unaligned loads; _mm_loadu_si128 is the SSE2 intrinsic, _mm_lddqu_si128 would require SSE3)
+++ x = _mm_loadu_si128((__m128i*)input_ptr);
+++ y = _mm_loadu_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so every odd (imaginary) byte lands in the low byte of its 16-bit lane
+++ imagx = _mm_and_si128 (imagx, mult1); // clear the high byte: imaginary parts as zero-extended 16-bit lanes
+++ realx = _mm_and_si128 (x, mult1); // keep the even (real) bytes as zero-extended 16-bit lanes
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each lane-wise product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // Re = xr*yr - xi*yi
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // Im = xr*yi + xi*yr
+++
+++ //Get early values: complex-multiply the base-band lanes by the Early code and accumulate
+++ y = _mm_loadu_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
+++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+++
+++ //Get late values: same steps with the Late code
+++ y = _mm_loadu_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+++
+++ //Get prompt values: same steps with the Prompt code
+++ y = _mm_loadu_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ L_code_ptr += 8;
+++ P_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); // keep only the low 8 bits of each 16-bit real sum
+++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); // keep only the low 8 bits of each 16-bit imaginary sum
+++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the imaginary bytes into the odd byte positions
+++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); // re-interleave real (even) and imaginary (odd) bytes
+++ _mm_storeu_si128((__m128i*)E_dotProductVector, output);
+++
+++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
+++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
+++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
+++ _mm_storeu_si128((__m128i*)L_dotProductVector, output);
+++
+++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
+++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
+++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
+++ _mm_storeu_si128((__m128i*)P_dotProductVector, output);
+++
+++ for (int i = 0; i<8; ++i) // add the 8 re-interleaved complex partial sums into each output
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide vector loop
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+++ }
+++}
+++
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar implementation)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ lv_8sc_t bb_signal_sample;
+++
+++ bb_signal_sample = lv_cmake(0, 0);
+++
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ // perform Early, Prompt and Late correlation (index is unsigned to match num_points and avoid signed overflow)
+++ for(unsigned int i=0; i < num_points; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Now get early, late, and prompt values for each
+++ *E_out += bb_signal_sample * E_code[i];
+++ *P_out += bb_signal_sample * P_code[i];
+++ *L_out += bb_signal_sample * L_code[i];
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 variant using aligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 complex samples per pass (every pointer advances by 8)
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_8sc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_8sc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_8sc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0; // the outputs are accumulated with += below, so clear them first
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: 0xFF in the low byte of every 16-bit lane, 0x00 in the high byte
+++
+++ real_E_code_acc = _mm_setzero_si128();
+++ imag_E_code_acc = _mm_setzero_si128();
+++ real_L_code_acc = _mm_setzero_si128();
+++ imag_L_code_acc = _mm_setzero_si128();
+++ real_P_code_acc = _mm_setzero_si128();
+++ imag_P_code_acc = _mm_setzero_si128();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off (aligned 16-byte loads: input and carrier must be 16-byte aligned)
+++ x = _mm_load_si128((__m128i*)input_ptr);
+++ y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so every odd (imaginary) byte lands in the low byte of its 16-bit lane
+++ imagx = _mm_and_si128 (imagx, mult1); // clear the high byte: imaginary parts as zero-extended 16-bit lanes
+++ realx = _mm_and_si128 (x, mult1); // keep the even (real) bytes as zero-extended 16-bit lanes
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each lane-wise product
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // Re = xr*yr - xi*yi
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // Im = xr*yi + xi*yr
+++
+++ //Get early values: complex-multiply the base-band lanes by the Early code and accumulate (aligned load)
+++ y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
+++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+++
+++ //Get late values: same steps with the Late code
+++ y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+++
+++ //Get prompt values: same steps with the Prompt code
+++ y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ L_code_ptr += 8;
+++ P_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the low byte of each imaginary sum into the odd byte position
+++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); // even bytes from the real sums, odd bytes from the shifted imaginary sums: only the low 8 bits of each 16-bit sum are kept
+++ _mm_store_si128((__m128i*)E_dotProductVector, output);
+++
+++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1);
+++ _mm_store_si128((__m128i*)L_dotProductVector, output);
+++
+++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
+++ _mm_store_si128((__m128i*)P_dotProductVector, output);
+++
+++ for (int i = 0; i<8; ++i) // add the 8 re-interleaved complex partial sums into each output
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the 8-wide vector loop
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+++ }
+++}
+++
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_8sc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_8sc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_8sc_t* P_out_ptr = P_out;
+++
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++ real_E_code_acc = _mm_setzero_si128();
+++ imag_E_code_acc = _mm_setzero_si128();
+++ real_L_code_acc = _mm_setzero_si128();
+++ imag_L_code_acc = _mm_setzero_si128();
+++ real_P_code_acc = _mm_setzero_si128();
+++ imag_P_code_acc = _mm_setzero_si128();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x = _mm_load_si128((__m128i*)input_ptr);
+++ y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x, 1);
+++ imagx = _mm_and_si128 (imagx, mult1);
+++ realx = _mm_and_si128 (x, mult1);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ //Get early values
+++ y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
+++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);
+++
+++ //Get late values
+++ y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
+++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);
+++
+++ //Get prompt values
+++ y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
+++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ E_code_ptr += 8;
+++ L_code_ptr += 8;
+++ P_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1);
+++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1);
+++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
+++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc);
+++ _mm_store_si128((__m128i*)E_dotProductVector, output);
+++
+++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
+++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
+++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
+++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
+++ _mm_store_si128((__m128i*)L_dotProductVector, output);
+++
+++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
+++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
+++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
+++ _mm_store_si128((__m128i*)P_dotProductVector, output);
+++
+++ for (int i = 0; i<8; ++i)
+++ {
+++ *E_out_ptr += E_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get early, late, and prompt values for each
+++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+++ }
+++}
+++
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    // Each output is a single complex accumulator, cleared before summing.
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++
+++    // perform Early, Prompt and Late correlation
+++    // (the index is unsigned to match the type of num_points)
+++    for(unsigned int i = 0; i < num_points; ++i)
+++    {
+++        // Perform the carrier wipe-off; the product is stored as lv_8sc_t,
+++        // so it must fit the 8-bit real/imaginary components.
+++        const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
+++        // Correlate the baseband sample against each code replica
+++        *E_out += bb_signal_sample * E_code[i];
+++        *P_out += bb_signal_sample * P_code[i];
+++        *L_out += bb_signal_sample * L_code[i];
+++    }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++ // External ORC routines (declared here, defined outside this header); results are returned through the short* arguments.
+++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points);
+++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){
+++ // For each 16-bit result below, a char pointer is advanced by one so that it addresses the second byte of the short.
+++ short E_out_real = 0;
+++ short E_out_imag = 0;
+++ char* E_out_real_c = (char*)&E_out_real;
+++ E_out_real_c++;
+++ char* E_out_imag_c = (char*)&E_out_imag;
+++ E_out_imag_c++;
+++
+++ short P_out_real = 0;
+++ short P_out_imag = 0;
+++ char* P_out_real_c = (char*)&P_out_real;
+++ P_out_real_c++;
+++ char* P_out_imag_c = (char*)&P_out_imag;
+++ P_out_imag_c++;
+++
+++ short L_out_real = 0;
+++ short L_out_imag = 0;
+++ char* L_out_real_c = (char*)&L_out_real;
+++ L_out_real_c++;
+++ char* L_out_imag_c = (char*)&L_out_imag;
+++ L_out_imag_c++;
+++
+++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, &E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points);
+++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points);
+++
+++ //The ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is split into two functions because, with all the
+++ //code in a single function, its length seems to cause memory problems (bad access, segmentation fault).
+++ //Also, the maximum number of accumulators that can be used is 4 (and we need 6).
+++ //The "carrier wipe-off" step is therefore done twice: once in the first function and once in the second.
+++ //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done
+++ //only once.
+++ // NOTE(review): only the byte at offset 1 of each short is returned; which half of the 16-bit value that is depends on host byte order - confirm against the ORC code.
+++ *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c);
+++ *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c);
+++ *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,797 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, if input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 (2 bits).
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+++ * real part and 8 bits the imaginary part), and accumulates the result
+++ * in 32-bit single-precision floating-point values, returning float32 values:
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 16 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - Very Early values are calculated by multiplying the input signal in BB by the
+++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Very Late values are calculated by multiplying the input signal in BB by the
+++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ *
+++ * -------------------------------------------------------------------------
+++ * Bits analysis
+++ *
+++ * input = 8 bits
+++ * carrier = 8 bits
+++ * XX_code = 8 bits
+++ * XX_out = 8 bits
+++ * bb_signal_sample = 8 bits
+++ *
+++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+++ *
+++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between -7 and 7 to avoid overflow (3 bits)
+++ *
+++ * conclusion = input and carrier must be values between -1 and 1 (1 bit) and XX_code must be values between -7 and 7 to avoid overflow (3 bits)
+++ * If input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 to avoid overflow (2 bits).
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+++    __m128 output_ps;
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    *VE_out_ptr = 0;
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++    *VL_out_ptr = 0;
+++    // mult1 keeps the low byte of every 16-bit lane (0x00FF pattern)
+++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    VE_code_acc = _mm_setzero_ps();
+++    E_code_acc = _mm_setzero_ps();
+++    P_code_acc = _mm_setzero_ps();
+++    L_code_acc = _mm_setzero_ps();
+++    VL_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters > 0)
+++    {
+++        for(unsigned int number = 0; number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off (8 complex samples per 128-bit load)
+++            x = _mm_lddqu_si128((const __m128i*)input_ptr);
+++            y = _mm_lddqu_si128((const __m128i*)carrier_ptr);
+++
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++            //Get very early values
+++            y = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+++
+++            //Get early values
+++            y = _mm_lddqu_si128((const __m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+++
+++            //Get prompt values
+++            y = _mm_lddqu_si128((const __m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+++
+++            //Get late values
+++            y = _mm_lddqu_si128((const __m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+++
+++            //Get very late values
+++            y = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++        // Horizontal reduction: each accumulator register holds two partial complex sums
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++        _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++        for (unsigned int i = 0; i < 2; ++i)
+++        {
+++            *VE_out_ptr += VE_dotProductVector[i];
+++            *E_out_ptr += E_dotProductVector[i];
+++            *P_out_ptr += P_dotProductVector[i];
+++            *L_out_ptr += L_dotProductVector[i];
+++            *VL_out_ptr += VL_dotProductVector[i];
+++        }
+++    }
+++    // Scalar tail for the remaining num_points % 8 samples
+++    lv_8sc_t bb_signal_sample;
+++    for(unsigned int i = 0; i < num_points % 8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE2, unaligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 output_ps_1, output_ps_2;
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    *VE_out_ptr = 0;
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++    *VL_out_ptr = 0;
+++    // mult1 keeps the low byte of every 16-bit lane (0x00FF pattern)
+++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    VE_code_acc = _mm_setzero_ps();
+++    E_code_acc = _mm_setzero_ps();
+++    P_code_acc = _mm_setzero_ps();
+++    L_code_acc = _mm_setzero_ps();
+++    VL_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters > 0)
+++    {
+++        for(unsigned int number = 0; number < sse_iters; number++){
+++            // Unaligned loads use _mm_loadu_si128 (SSE2); _mm_lddqu_si128 would require SSE3.
+++            //Perform the carrier wipe-off (8 complex samples per 128-bit load)
+++            x = _mm_loadu_si128((const __m128i*)input_ptr);
+++            y = _mm_loadu_si128((const __m128i*)carrier_ptr);
+++
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++            //Get very early values
+++            y = _mm_loadu_si128((const __m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+++
+++            //Get early values
+++            y = _mm_loadu_si128((const __m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++            //Get prompt values
+++            y = _mm_loadu_si128((const __m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++            //Get late values
+++            y = _mm_loadu_si128((const __m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++            //Get very late values
+++            y = _mm_loadu_si128((const __m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++        // Horizontal reduction: each accumulator register holds two partial complex sums
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++        _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++        for (unsigned int i = 0; i < 2; ++i)
+++        {
+++            *VE_out_ptr += VE_dotProductVector[i];
+++            *E_out_ptr += E_dotProductVector[i];
+++            *P_out_ptr += P_dotProductVector[i];
+++            *L_out_ptr += L_dotProductVector[i];
+++            *VL_out_ptr += VL_dotProductVector[i];
+++        }
+++    }
+++    // Scalar tail for the remaining num_points % 8 samples
+++    lv_8sc_t bb_signal_sample;
+++    for(unsigned int i = 0; i < num_points % 8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    // Each output is a single complex float accumulator, cleared before summing.
+++    *VE_out = 0;
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++
+++    // perform Very Early, Early, Prompt, Late and Very Late correlation
+++    // (the index is unsigned to match the type of num_points)
+++    for(unsigned int i = 0; i < num_points; ++i)
+++    {
+++        // Perform the carrier wipe-off; the product is stored as lv_8sc_t
+++        const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
+++
+++        // The cast to lv_32fc_t is applied to the product of the two lv_8sc_t operands
+++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+++    }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{ // Multiplies input by carrier, then correlates that product with the five code replicas; each *_out receives one accumulated value
+++ const unsigned int sse_iters = num_points / 8; // complete groups of 8 samples; the num_points % 8 leftovers go through the scalar loop at the end
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // running float sums, one register (four floats) per correlator
+++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+++ __m128 output_ps;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ *VE_out_ptr = 0; // the five outputs are overwritten: whatever they held before the call is discarded
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ *VL_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane; handed to the CM_* macros below
+++
+++ VE_code_acc = _mm_setzero_ps();
+++ E_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){ // NOTE(review): signed counter compared against an unsigned bound
+++
+++ //Perform the carrier wipe-off. The CM_* macros are defined in the CommonMacros headers (not visible here); NOTE(review): presumably they split x and y into real/imaginary 16-bit lanes and form the complex product - confirm
+++ x = _mm_load_si128((__m128i*)input_ptr); // aligned 128-bit load: input_ptr must be 16-byte aligned
+++ y = _mm_load_si128((__m128i*)carrier_ptr); // the same alignment requirement applies to carrier and to every code pointer loaded below
+++
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get very early values: y is reused for the code replica; NOTE(review): the macro presumably leaves the float partial products in output_ps - confirm
+++ y = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+++
+++ //Get early values (same macro call as above, different code replica)
+++ y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+++
+++ //Get prompt values
+++ y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+++
+++ //Get late values
+++ y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+++
+++ //Get very late values
+++ y = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+++
+++ input_ptr += 8; // advance every input stream by the 8 samples consumed in this iteration
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; // scratch: the four floats of each accumulator are read back as two lv_32fc_t values
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++ _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i) // fold the two partial sums of each correlator into its single output value
+++ {
+++ *VE_out_ptr += VE_dotProductVector[i];
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *VL_out_ptr += VL_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample; // the wipe-off product is held in the 8-bit complex type before it is correlated
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: samples that did not fill a complete group of 8
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get very early, early, prompt, late and very late values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{ // Multiplies input by carrier, then correlates that product with the five code replicas; each *_out receives one accumulated value
+++ const unsigned int sse_iters = num_points / 8; // complete groups of 8 samples; the num_points % 8 leftovers go through the scalar loop at the end
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // running float sums, one register (four floats) per correlator
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 output_ps_1, output_ps_2; // both are added to the accumulator after every correlation macro call
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ *VE_out_ptr = 0; // the five outputs are overwritten: whatever they held before the call is discarded
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ *VL_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane; handed to the CM_* macros below
+++
+++ VE_code_acc = _mm_setzero_ps();
+++ E_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){ // NOTE(review): signed counter compared against an unsigned bound
+++
+++ //Perform the carrier wipe-off. The CM_* macros are defined in the CommonMacros headers (not visible here); NOTE(review): presumably they split x and y into real/imaginary 16-bit lanes and form the complex product - confirm
+++ x = _mm_load_si128((__m128i*)input_ptr); // aligned 128-bit load: input_ptr must be 16-byte aligned
+++ y = _mm_load_si128((__m128i*)carrier_ptr); // the same alignment requirement applies to carrier and to every code pointer loaded below
+++
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++ //Get very early values: y is reused for the code replica; NOTE(review): the macro presumably leaves the float partial products in output_ps_1 and output_ps_2 - confirm
+++ y = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+++
+++ //Get early values (same macro call as above, different code replica)
+++ y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++ //Get prompt values
+++ y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++ //Get late values
+++ y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++ //Get very late values
+++ y = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+++
+++ input_ptr += 8; // advance every input stream by the 8 samples consumed in this iteration
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; // scratch: the four floats of each accumulator are read back as two lv_32fc_t values
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++ _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++ _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i) // fold the two partial sums of each correlator into its single output value
+++ {
+++ *VE_out_ptr += VE_dotProductVector[i];
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *VL_out_ptr += VL_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample; // the wipe-off product is held in the 8-bit complex type before it is correlated
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: samples that did not fill a complete group of 8
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get very early, early, prompt, late and very late values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{ // Plain scalar implementation (no SIMD intrinsics): one sample per loop iteration, each *_out receives one accumulated value
+++ lv_8sc_t bb_signal_sample; // the wipe-off product is held in the 8-bit complex type before it is correlated
+++
+++ bb_signal_sample = lv_cmake(0, 0); // initial value only; it is reassigned on every loop iteration before being read
+++
+++ *VE_out = 0; // the five outputs are overwritten: whatever they held before the call is discarded
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ *VL_out = 0;
+++ // Perform the Very Early, Early, Prompt, Late and Very Late correlation
+++ for(int i=0; i < num_points; ++i) // NOTE(review): signed counter compared against an unsigned bound
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = input[i] * carrier[i];
+++
+++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); // the product is formed first and only then cast to lv_32fc_t
+++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,1520 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Very Early, Early, Prompt, Late and Very Late correlation with 16 bits vectors (8 bits for the
+++ * real part and 8 bits for the imaginary part), and accumulates the result
+++ * in 32-bit single-precision floating-point values, returning float32 values:
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 16 bits vectors). It returns the input
+++ * signal in baseband (BB)
+++ * - Very Early values are calculated by multiplying the input signal in BB by the
+++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Very Late values are calculated by multiplying the input signal in BB by the
+++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{ // Multiplies input by carrier, then correlates that product with the five code replicas using hand-written intrinsics; each *_out receives one accumulated value
+++ const unsigned int sse_iters = num_points / 8; // complete groups of 8 samples; the num_points % 8 leftovers go through the scalar loop at the end
+++
+++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; // running float sums, one register (four floats) per correlator
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 output_ps_1, output_ps_2;
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ *VE_out_ptr = 0; // the five outputs are overwritten: whatever they held before the call is discarded
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ *VL_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: selects the even-indexed bytes
+++
+++ VE_code_acc = _mm_setzero_ps();
+++ E_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){ // NOTE(review): signed counter compared against an unsigned bound
+++
+++ //Perform the carrier wipe-off (assumes samples are stored as interleaved real, imaginary bytes - TODO confirm against lv_8sc_t)
+++ x = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned 128-bit load: no alignment requirement on the pointer
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so every odd-indexed byte lands in the low byte of its 16-bit lane
+++ imagx = _mm_and_si128 (imagx, mult1); // keep only that low byte (zero-extended, not sign-extended)
+++ realx = _mm_and_si128 (x, mult1); // even-indexed bytes are already in the low byte of each lane
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); // low 16 bits of each of the four partial products
+++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // complex product, real part: re*re - im*im
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // complex product, imaginary part: re*im + im*re
+++
+++ //Get very early values: multiply the wiped-off samples by the code replica, then widen and accumulate
+++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of each imaginary result to the odd byte position
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real_output, odd bytes from imag_output: only the low 8 bits of each 16-bit result are kept
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output); // sign-extend bytes 0..3 to four 32-bit integers
+++ output = _mm_srli_si128 (output, 4); // bring the next four bytes down to the bottom
+++ input_i_2 = _mm_cvtepi8_epi32(output); // sign-extend bytes 4..7
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); // lane-wise sum of the two groups
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output); // same widening for bytes 8..11 and 12..15
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4); // NOTE(review): this shifted value is not read again before output is reassigned
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+++
+++ //Get early values (same sequence as the very early branch, with the early code replica)
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++ //Get prompt values (same sequence, with the prompt code replica)
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++ //Get late values (same sequence, with the late code replica)
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++ //Get very late values (same sequence, with the very late code replica)
+++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++ imagy = _mm_srli_si128 (y, 1);
+++ imagy = _mm_and_si128 (imagy, mult1);
+++ realy = _mm_and_si128 (y, mult1);
+++
+++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+++
+++ input_ptr += 8; // advance every input stream by the 8 samples consumed in this iteration
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; // scratch: the four floats of each accumulator are read back as two lv_32fc_t values
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i) // fold the two partial sums of each correlator into its single output value
+++ {
+++ *VE_out_ptr += VE_dotProductVector[i];
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *VL_out_ptr += VL_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample; // the wipe-off product is held in the 8-bit complex type before it is correlated
+++ for(int i=0; i < num_points%8; ++i) // scalar tail: samples that did not fill a complete group of 8
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get very early, early, prompt, late and very late values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // every 128-bit load below consumes 8 lv_8sc_t samples
+++
+++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++ __m128i mult1, output, real_output, imag_output;
+++
+++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32;
+++ __m128 output_ps_1, output_ps_2;
+++
+++ __m128i check_sign_sequence = _mm_set_epi8 (-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); // negates the odd (imaginary) bytes so that maddubs yields re*re - im*im
+++ __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // swaps the two bytes of every sample; loop-invariant, so built once here
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ *VE_out_ptr = 0;
+++ *E_out_ptr = 0;
+++ *P_out_ptr = 0;
+++ *L_out_ptr = 0;
+++ *VL_out_ptr = 0;
+++
+++ mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); // blend mask: even bytes come from real_output, odd bytes from the shifted imag_output
+++
+++ VE_code_acc = _mm_setzero_ps();
+++ E_code_acc = _mm_setzero_ps();
+++ P_code_acc = _mm_setzero_ps();
+++ L_code_acc = _mm_setzero_ps();
+++ VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(unsigned int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off (complex product input * carrier)
+++ x = _mm_lddqu_si128((__m128i*)input_ptr);
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ x_abs = _mm_abs_epi8 (x); // maddubs takes its first operand as unsigned, so the sign of x is moved onto y below
+++
+++ y_aux = _mm_sign_epi8 (y, x);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); // per sample: x_re*y_re - x_im*y_im
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, x);
+++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // per sample: x_re*y_im + x_im*y_re
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of each 16-bit imaginary result to the odd byte position
+++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // repack as 8-bit complex samples; only the low byte of each 16-bit result is kept
+++
+++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++ //Get very early values
+++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output); // sign-extend the 4 lowest bytes (2 complex samples) to int32
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4); // NOTE(review): dead store, output is overwritten before its next read (same in the sections below)
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+++
+++ //Get early values
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++ //Get prompt values
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++ //Get late values
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++ //Get very late values
+++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_1 = _mm_cvtepi32_ps(output_i32);
+++
+++ input_i_1 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ input_i_2 = _mm_cvtepi8_epi32(output);
+++ output = _mm_srli_si128 (output, 4);
+++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);
+++ output_ps_2 = _mm_cvtepi32_ps(output_i32);
+++
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Each accumulator holds two partial complex sums (re, im, re, im)
+++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<2; ++i)
+++ {
+++ *VE_out_ptr += VE_dotProductVector[i];
+++ *E_out_ptr += E_dotProductVector[i];
+++ *P_out_ptr += P_dotProductVector[i];
+++ *L_out_ptr += L_dotProductVector[i];
+++ *VL_out_ptr += VL_dotProductVector[i];
+++ }
+++ }
+++
+++ lv_8sc_t bb_signal_sample;
+++ for(unsigned int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole SSE register
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get very early, early, prompt, late and very late values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late, and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // every 128-bit load below consumes 8 lv_8sc_t samples
+++
+++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++ __m128i mult1, real_output, imag_output;
+++
+++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32;
+++ __m128 real_output_ps, imag_output_ps;
+++
+++ __m128i check_sign_sequence = _mm_set_epi8 (-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1); // negates the odd (imaginary) bytes so that maddubs yields re*re - im*im
+++ __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // swaps the two bytes of every sample
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ float VE_out_real = 0;
+++ float VE_out_imag = 0;
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++ float VL_out_real = 0;
+++ float VL_out_imag = 0;
+++
+++ mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); // blend mask: even bytes come from real_output, odd bytes from the shifted imag_output
+++
+++ real_VE_code_acc = _mm_setzero_ps();
+++ imag_VE_code_acc = _mm_setzero_ps();
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++ real_VL_code_acc = _mm_setzero_ps();
+++ imag_VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(unsigned int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off (complex product input * carrier)
+++ x = _mm_lddqu_si128((__m128i*)input_ptr);
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ x_abs = _mm_abs_epi8 (x); // maddubs takes its first operand as unsigned, so the sign of x is moved onto y below
+++
+++ y_aux = _mm_sign_epi8 (y, x);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); // per sample: x_re*y_re - x_im*y_im
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, x);
+++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); // per sample: x_re*y_im + x_im*y_re
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1); // move the low byte of each 16-bit imaginary result to the odd byte position
+++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // repack as 8-bit complex samples; only the low byte of each 16-bit result is kept
+++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++ //Get very early values
+++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); // widen the 4 low 16-bit results to int32
+++ real_output = _mm_srli_si128 (real_output, 8); // bring the 4 high 16-bit results down
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++ //Get early values
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ //Get very late values
+++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i) // horizontal sum of the four partial sums held in each accumulator
+++ {
+++ VE_out_real += real_VE_dotProductVector[i];
+++ VE_out_imag += imag_VE_dotProductVector[i];
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ VL_out_real += real_VL_dotProductVector[i];
+++ VL_out_imag += imag_VL_dotProductVector[i];
+++ }
+++ }
+++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); // written even when sse_iters == 0, so the tail loop below never accumulates onto uninitialised output
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(unsigned int i=0; i < num_points%8; ++i) // scalar tail for the samples that do not fill a whole SSE register
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get very early, early, prompt, late and very late values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late, and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8;
+++
+++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
+++ __m128i real_output, imag_output;
+++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32;
+++ __m128 real_output_ps, imag_output_ps;
+++ __m128i minus128control;
+++
+++ __m128i minus128 = _mm_set1_epi8 (-128);
+++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ float VE_out_real = 0;
+++ float VE_out_imag = 0;
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++ float VL_out_real = 0;
+++ float VL_out_imag = 0;
+++
+++ real_VE_code_acc = _mm_setzero_ps();
+++ imag_VE_code_acc = _mm_setzero_ps();
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++ real_VL_code_acc = _mm_setzero_ps();
+++ imag_VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0)
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ //Perform the carrier wipe-off
+++ x = _mm_lddqu_si128((__m128i*)input_ptr);
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ x_abs = _mm_abs_epi8 (x);
+++
+++ y_aux = _mm_sign_epi8 (y, x);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (x_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, x);
+++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux);
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1);
+++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++ //Get very early values
+++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++ minus128control = _mm_cmpeq_epi8 (y, minus128);
+++ y = _mm_sub_epi8 (y, minus128control);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++ //Get early values
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++ minus128control = _mm_cmpeq_epi8 (y, minus128);
+++ y = _mm_sub_epi8 (y, minus128control);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ //Get prompt values
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++ minus128control = _mm_cmpeq_epi8 (y, minus128);
+++ y = _mm_sub_epi8 (y, minus128control);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ //Get late values
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++ minus128control = _mm_cmpeq_epi8 (y, minus128);
+++ y = _mm_sub_epi8 (y, minus128control);
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ //Get very late values
+++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++ minus128control = _mm_cmpeq_epi8 (y, minus128);
+++ y = _mm_sub_epi8 (y, minus128control);
+++
+++
+++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux);
+++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);
+++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
+++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux);
+++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux);
+++
+++ real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++ real_output = _mm_srli_si128 (real_output, 8);
+++ real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
+++ real_output_ps = _mm_cvtepi32_ps(real_output_i32);
+++
+++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output = _mm_srli_si128 (imag_output, 8);
+++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
+++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
+++
+++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++ input_ptr += 8;
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i)
+++ {
+++ VE_out_real += real_VE_dotProductVector[i];
+++ VE_out_imag += imag_VE_dotProductVector[i];
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ VL_out_real += real_VL_dotProductVector[i];
+++ VL_out_imag += imag_VL_dotProductVector[i];
+++ }
+++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++ }
+++
+++ lv_16sc_t bb_signal_sample;
+++ for(int i=0; i < num_points%8; ++i)
+++ {
+++ //Perform the carrier wipe-off
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Now get very early, early, prompt, late and very late values for each
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++#include <stdio.h>
+++#include <tmmintrin.h>
+++
+++/*!
+++ \brief Generic (plain C) kernel: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation, accumulated into float32 complex outputs
+++ \param input The input signal samples (indices 0..num_points-1 are read)
+++ \param carrier The local carrier samples, multiplied element-wise with input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single value, zeroed on entry and then accumulated)
+++ \param E_out Early correlation output (single value, zeroed on entry and then accumulated)
+++ \param P_out Prompt correlation output (single value, zeroed on entry and then accumulated)
+++ \param L_out Late correlation output (single value, zeroed on entry and then accumulated)
+++ \param VL_out Very Late correlation output (single value, zeroed on entry and then accumulated)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++ *VE_out = 0; // every output starts from zero, so the result does not depend on the caller's previous contents
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ *VL_out = 0;
+++
+++
+++ lv_16sc_t VE_code_value; // per-sample working copies of the code replicas, held in the 16-bit complex type
+++ lv_16sc_t E_code_value;
+++ lv_16sc_t P_code_value;
+++ lv_16sc_t L_code_value;
+++ lv_16sc_t VL_code_value;
+++ lv_16sc_t bb_signal_sample; // input * carrier for the current sample
+++
+++ for(int i=0; i < num_points; ++i) // NOTE(review): signed index compared against unsigned num_points
+++ {
+++ VE_code_value = VE_code[i]; // copy first: the const inputs are never modified, only the local copies are adjusted
+++ E_code_value = E_code[i];
+++ P_code_value = P_code[i];
+++ L_code_value = L_code[i];
+++ VL_code_value = VL_code[i];
+++
+++ if(lv_creal(VE_code_value) == -128) // a real part of -128 is replaced by -127, imaginary part kept
+++ {
+++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+++ }
+++ if(lv_cimag(VE_code_value) == -128) // same replacement for the imaginary part, real part kept
+++ {
+++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+++ }
+++
+++ if(lv_creal(E_code_value) == -128) // Early code: same -128 -> -127 replacement
+++ {
+++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+++ }
+++ if(lv_cimag(E_code_value) == -128)
+++ {
+++ E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+++ }
+++
+++ if(lv_creal(P_code_value) == -128) // Prompt code: same -128 -> -127 replacement
+++ {
+++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+++ }
+++ if(lv_cimag(P_code_value) == -128)
+++ {
+++ P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+++ }
+++
+++ if(lv_creal(L_code_value) == -128) // Late code: same -128 -> -127 replacement
+++ {
+++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+++ }
+++ if(lv_cimag(L_code_value) == -128)
+++ {
+++ L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+++ }
+++
+++ if(lv_creal(VL_code_value) == -128) // Very Late code: same -128 -> -127 replacement
+++ {
+++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+++ }
+++ if(lv_cimag(VL_code_value) == -128)
+++ {
+++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+++ }
+++
+++ // Carrier wipe-off: mix the input sample with the local carrier sample
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Accumulate the five correlator outputs (VE, E, P, L, VL) for this sample
+++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+++ }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++//#ifdef LV_HAVE_GENERIC
+++//#include <stdio.h>
+++//#include <stdlib.h>
+++//#include <tmmintrin.h>
+++//
+++//#ifndef MAX
+++//#define MAX(a,b) ((a) > (b) ? a : b)
+++//#endif
+++//
+++//#ifndef MIN
+++//#define MIN(a,b) ((a) < (b) ? a : b)
+++//#endif
+++//
+++///*!
+++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++// \param input The input signal input
+++// \param carrier The carrier signal input
+++// \param VE_code Very Early PRN code replica input
+++// \param E_code Early PRN code replica input
+++// \param P_code Prompt PRN code replica input
+++// \param L_code Late PRN code replica input
+++// \param VL_code Very Late PRN code replica input
+++// \param VE_out Very Early correlation output
+++// \param E_out Early correlation output
+++// \param P_out Prompt correlation output
+++// \param L_out Late correlation output
+++// \param VL_out Very Late correlation output
+++// \param num_points The number of complex values in vectors
+++// */
+++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++//{
+++// *VE_out = 0;
+++// *E_out = 0;
+++// *P_out = 0;
+++// *L_out = 0;
+++// *VL_out = 0;
+++//
+++// lv_16sc_t VE_out16;
+++// lv_16sc_t E_out16;
+++// lv_16sc_t P_out16;
+++// lv_16sc_t L_out16;
+++// lv_16sc_t VL_out16;
+++//
+++// int32_t max = 32767;
+++// int32_t min = -32768;
+++//
+++// int16_t real_real;
+++// int16_t imag_imag;
+++// int16_t real_imag;
+++// int16_t imag_real;
+++// int32_t out_real_32;
+++// int32_t out_imag_32;
+++// int16_t out_real_16;
+++// int16_t out_imag_16;
+++// int16_t aux1;
+++// int16_t aux2;
+++//
+++//
+++// lv_8sc_t bb_signal_sample = lv_cmake(0, 0);
+++//
+++// // perform very early, Early, Prompt, Late and very late correlation
+++// for(int i=0; i < num_points; ++i)
+++// {
+++// //Perform the carrier wipe-off
+++// bb_signal_sample = input[i] * carrier[i];
+++//
+++// aux1 = (int16_t)lv_creal(bb_signal_sample);
+++// aux2 = (int16_t)lv_creal(VE_code[i]);
+++// real_real = aux1*aux2;
+++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+++// aux2 = (int16_t)lv_cimag(VE_code[i]);
+++// imag_imag = aux1*aux2;
+++// aux1 = (int16_t)lv_creal(bb_signal_sample);
+++// aux2 = (int16_t)lv_cimag(VE_code[i]);
+++// real_imag = aux1*aux2;
+++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+++// aux2 = (int16_t)lv_creal(VE_code[i]);
+++// imag_real = aux1*aux2;
+++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag;
+++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real;
+++// out_real_16 = MIN(MAX(out_real_32, min), max);
+++// out_imag_16 = MIN(MAX(out_imag_32, min), max);
+++// VE_out16 = lv_cmake(out_real_16, out_imag_16);
+++//
+++//
+++//
+++// if(lv_creal(L_code[i]) == -128)
+++// {
+++// int8_t* L_pointer = (int8_t*)&L_code[i];
+++// *L_pointer = -127;
+++// }
+++// if(lv_cimag(L_code[i]) == -128)
+++// {
+++// int8_t* L_pointer = (int8_t*)&L_code[i];
+++// L_pointer++;
+++// *L_pointer = -127;
+++// }
+++// aux1 = (int16_t)lv_creal(bb_signal_sample);
+++// aux2 = (int16_t)lv_creal(L_code[i]);
+++// real_real = aux1*aux2;
+++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+++// aux2 = (int16_t)lv_cimag(L_code[i]);
+++// imag_imag = aux1*aux2;
+++// aux1 = (int16_t)lv_creal(bb_signal_sample);
+++// aux2 = (int16_t)lv_cimag(L_code[i]);
+++// real_imag = aux1*aux2;
+++// aux1 = (int16_t)lv_cimag(bb_signal_sample);
+++// aux2 = (int16_t)lv_creal(L_code[i]);
+++// imag_real = aux1*aux2;
+++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag;
+++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real;
+++// out_real_16 = MIN(MAX(out_real_32, min), max);
+++// out_imag_16 = MIN(MAX(out_imag_32, min), max);
+++// L_out16 = lv_cmake(out_real_16, out_imag_16);
+++//
+++// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i];
+++// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i];
+++// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i];
+++//
+++//
+++// *VE_out += (lv_32fc_t) VE_out16;
+++// *E_out += (lv_32fc_t) E_out16;
+++// *P_out += (lv_32fc_t) P_out16;
+++// *L_out += (lv_32fc_t) L_out16;
+++// *VL_out += (lv_32fc_t) VL_out16;
+++//
+++// //error in the real part of L with 32 samples
+++// //*L_out = lv_cmake(12, 12);
+++// }
+++//}
+++//
+++//#endif /* LV_HAVE_GENERIC */
+++
+++//#ifdef LV_HAVE_GENERIC
+++///*!
+++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++// \param input The input signal input
+++// \param carrier The carrier signal input
+++// \param VE_code Very Early PRN code replica input
+++// \param E_code Early PRN code replica input
+++// \param P_code Prompt PRN code replica input
+++// \param L_code Late PRN code replica input
+++// \param VL_code Very Late PRN code replica input
+++// \param VE_out Very Early correlation output
+++// \param E_out Early correlation output
+++// \param P_out Prompt correlation output
+++// \param L_out Late correlation output
+++// \param VL_out Very Late correlation output
+++// \param num_points The number of complex values in vectors
+++// */
+++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++//{
+++// lv_8sc_t bb_signal_sample;
+++//
+++// bb_signal_sample = lv_cmake(0, 0);
+++//
+++// *VE_out = 0;
+++// *E_out = 0;
+++// *P_out = 0;
+++// *L_out = 0;
+++// *VL_out = 0;
+++// // perform very early, Early, Prompt, Late and very late correlation
+++// for(int i=0; i < num_points; ++i)
+++// {
+++// //Perform the carrier wipe-off
+++// bb_signal_sample = input[i] * carrier[i];
+++//
+++// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+++// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++// *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+++// }
+++//}
+++//
+++//#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,772 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+++ * real part and 8 bits the imaginary part), and accumulates the result
+++ * in 32-bit single-precision values, returning float32 values:
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 16 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - Very Early values are calculated by multiplying the input signal in BB by the
+++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Very Late values are calculated by multiplying the input signal in BB by the
+++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ *
+++ * -------------------------------------------------------------------------
+++ * Bits analysis
+++ *
+++ * input = 8 bits
+++ * carrier = 8 bits
+++ * XX_code = 8 bits
+++ * XX_out16 = 16 bits
+++ * bb_signal_sample = 8 bits
+++ *
+++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+++ *
+++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Unaligned SSE4.1 kernel: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation, accumulated into float32 complex outputs
+++ \param input The input signal samples (read with unaligned 128-bit loads, 8 complex values per iteration)
+++ \param carrier The local carrier samples, multiplied with input for the wipe-off
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single value; assigned only when num_points >= 8, see notes in the body)
+++ \param E_out Early correlation output (single value; assigned only when num_points >= 8)
+++ \param P_out Prompt correlation output (single value; assigned only when num_points >= 8)
+++ \param L_out Late correlation output (single value; assigned only when num_points >= 8)
+++ \param VL_out Very Late correlation output (single value; assigned only when num_points >= 8)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++ const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 complex samples per pass; the remainder goes to the scalar tail
+++
+++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; // NOTE(review): stray second ';'
+++ __m128i real_output, imag_output;
+++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++ __m128i input_i_1, input_i_2, output_i32; // scratch registers handed to the correlation macro
+++ __m128 real_output_ps, imag_output_ps;
+++ __m128i minus128control;
+++
+++ __m128i minus128 = _mm_set1_epi8 (-128); // handed to the SAFE macro; presumably used there to detect -128 code bytes (macro body not visible here)
+++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); // even bytes = 1, odd bytes = 0xFF (i.e. -1 as a signed byte)
+++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // index pattern 1,0,3,2,...: swaps the two bytes of each pair when used as a shuffle mask
+++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // blend selector: 0xFF in even bytes, 0x00 in odd bytes
+++
+++ const lv_8sc_t* input_ptr = input;
+++ const lv_8sc_t* carrier_ptr = carrier;
+++
+++ const lv_8sc_t* VE_code_ptr = VE_code;
+++ lv_32fc_t* VE_out_ptr = VE_out;
+++ const lv_8sc_t* E_code_ptr = E_code;
+++ lv_32fc_t* E_out_ptr = E_out;
+++ const lv_8sc_t* P_code_ptr = P_code;
+++ lv_32fc_t* P_out_ptr = P_out;
+++ const lv_8sc_t* L_code_ptr = L_code;
+++ lv_32fc_t* L_out_ptr = L_out;
+++ const lv_8sc_t* VL_code_ptr = VL_code;
+++ lv_32fc_t* VL_out_ptr = VL_out;
+++
+++ float VE_out_real = 0; // scalar totals filled by the horizontal sum after the vector loop
+++ float VE_out_imag = 0;
+++ float E_out_real = 0;
+++ float E_out_imag = 0;
+++ float P_out_real = 0;
+++ float P_out_imag = 0;
+++ float L_out_real = 0;
+++ float L_out_imag = 0;
+++ float VL_out_real = 0;
+++ float VL_out_imag = 0;
+++
+++ real_VE_code_acc = _mm_setzero_ps(); // ten 4-lane float accumulators: real and imaginary part for each of the five correlators
+++ imag_VE_code_acc = _mm_setzero_ps();
+++ real_E_code_acc = _mm_setzero_ps();
+++ imag_E_code_acc = _mm_setzero_ps();
+++ real_P_code_acc = _mm_setzero_ps();
+++ imag_P_code_acc = _mm_setzero_ps();
+++ real_L_code_acc = _mm_setzero_ps();
+++ imag_L_code_acc = _mm_setzero_ps();
+++ real_VL_code_acc = _mm_setzero_ps();
+++ imag_VL_code_acc = _mm_setzero_ps();
+++
+++ if (sse_iters>0) // NOTE(review): *XX_out_ptr is assigned only inside this branch
+++ {
+++ for(int number = 0;number < sse_iters; number++){
+++
+++ // Carrier wipe-off: load 8 input and 8 carrier samples (unaligned) and multiply them
+++ x = _mm_lddqu_si128((__m128i*)input_ptr);
+++ y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++ x_abs = _mm_abs_epi8 (x);
+++
+++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+++
+++ imag_output = _mm_slli_si128 (imag_output, 1); // move the imaginary results up one byte so they sit in the odd byte positions
+++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real_output, odd bytes from imag_output
+++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++ // Very Early correlator: load the code and let the SAFE macro produce real_output_ps / imag_output_ps
+++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++ // Early correlator
+++ y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++ // Prompt correlator
+++ y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++ // Late correlator
+++ y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++ // Very Late correlator
+++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++ input_ptr += 8; // advance every stream by the 8 samples just consumed
+++ carrier_ptr += 8;
+++ VE_code_ptr += 8;
+++ E_code_ptr += 8;
+++ P_code_ptr += 8;
+++ L_code_ptr += 8;
+++ VL_code_ptr += 8;
+++ }
+++
+++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; // spill buffers used to sum the 4 lanes of each accumulator
+++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++ for (int i = 0; i<4; ++i) // horizontal sum: fold the four lanes of each accumulator into one scalar
+++ {
+++ VE_out_real += real_VE_dotProductVector[i];
+++ VE_out_imag += imag_VE_dotProductVector[i];
+++ E_out_real += real_E_dotProductVector[i];
+++ E_out_imag += imag_E_dotProductVector[i];
+++ P_out_real += real_P_dotProductVector[i];
+++ P_out_imag += imag_P_dotProductVector[i];
+++ L_out_real += real_L_dotProductVector[i];
+++ L_out_imag += imag_L_dotProductVector[i];
+++ VL_out_real += real_VL_dotProductVector[i];
+++ VL_out_imag += imag_VL_dotProductVector[i];
+++ }
+++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); // the outputs are overwritten (not accumulated) with the vector-loop totals
+++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++ }
+++
+++ if(num_points%8!=0) // scalar tail for the samples left over after the 8-wide loop
+++ {
+++ lv_16sc_t bb_signal_sample;
+++ lv_16sc_t VE_code_value;
+++ lv_16sc_t E_code_value;
+++ lv_16sc_t P_code_value;
+++ lv_16sc_t L_code_value;
+++ lv_16sc_t VL_code_value;
+++
+++ for(int i=0; i < num_points%8; ++i) // NOTE(review): signed index compared against an unsigned expression
+++ {
+++ VE_code_value = *VE_code_ptr++; // pointers continue from where the vector loop stopped
+++ E_code_value = *E_code_ptr++;
+++ P_code_value = *P_code_ptr++;
+++ L_code_value = *L_code_ptr++;
+++ VL_code_value = *VL_code_ptr++; // NOTE(review): VL_code_value gets no -128 check below, unlike the other four codes and unlike the generic kernel
+++
+++ if(lv_creal(VE_code_value) == -128) // a real part of -128 is replaced by -127, imaginary part kept
+++ {
+++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+++ }
+++ if(lv_cimag(VE_code_value) == -128) // same replacement for the imaginary part, real part kept
+++ {
+++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+++ }
+++
+++ if(lv_creal(E_code_value) == -128) // Early code: same -128 -> -127 replacement
+++ {
+++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+++ }
+++ if(lv_cimag(E_code_value) == -128)
+++ {
+++ E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+++ }
+++
+++ if(lv_creal(P_code_value) == -128) // Prompt code: same -128 -> -127 replacement
+++ {
+++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+++ }
+++ if(lv_cimag(P_code_value) == -128)
+++ {
+++ P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+++ }
+++
+++ if(lv_creal(L_code_value) == -128) // Late code: same -128 -> -127 replacement
+++ {
+++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+++ }
+++ if(lv_cimag(L_code_value) == -128)
+++ {
+++ L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+++ }
+++
+++ // Carrier wipe-off: mix the input sample with the local carrier sample
+++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++ // Add this sample's contribution to the five correlator outputs (VE, E, P, L, VL)
+++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); // NOTE(review): when sse_iters == 0 this function has not written *VE_out_ptr (nor the other four outputs) before these +=
+++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
+++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
+++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
+++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+++ }
+++ }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++#include <stdio.h>
+++#include <tmmintrin.h>
+++
+++/*!
+++ \brief Generic (plain C) "safe" kernel: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation, with -128 code values replaced by -127 and results accumulated into float32 complex outputs
+++ \param input The input signal samples (indices 0..num_points-1 are read)
+++ \param carrier The local carrier samples, multiplied element-wise with input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single value, zeroed on entry and then accumulated)
+++ \param E_out Early correlation output (single value, zeroed on entry and then accumulated)
+++ \param P_out Prompt correlation output (single value, zeroed on entry and then accumulated)
+++ \param L_out Late correlation output (single value, zeroed on entry and then accumulated)
+++ \param VL_out Very Late correlation output (single value, zeroed on entry and then accumulated)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++ *VE_out = 0; // every output starts from zero, so the result does not depend on the caller's previous contents
+++ *E_out = 0;
+++ *P_out = 0;
+++ *L_out = 0;
+++ *VL_out = 0;
+++
+++ lv_16sc_t VE_code_value; // per-sample working copies of the code replicas, held in the 16-bit complex type
+++ lv_16sc_t E_code_value;
+++ lv_16sc_t P_code_value;
+++ lv_16sc_t L_code_value;
+++ lv_16sc_t VL_code_value;
+++ lv_16sc_t bb_signal_sample; // input * carrier for the current sample
+++
+++ for(int i=0; i < num_points; ++i) // NOTE(review): signed index compared against unsigned num_points
+++ {
+++ VE_code_value = VE_code[i]; // copy first: the const inputs are never modified, only the local copies are adjusted
+++ E_code_value = E_code[i];
+++ P_code_value = P_code[i];
+++ L_code_value = L_code[i];
+++ VL_code_value = VL_code[i];
+++
+++ if(lv_creal(VE_code_value) == -128) // a real part of -128 is replaced by -127, imaginary part kept
+++ {
+++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+++ }
+++ if(lv_cimag(VE_code_value) == -128) // same replacement for the imaginary part, real part kept
+++ {
+++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+++ }
+++
+++ if(lv_creal(E_code_value) == -128) // Early code: same -128 -> -127 replacement
+++ {
+++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+++ }
+++ if(lv_cimag(E_code_value) == -128)
+++ {
+++ E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+++ }
+++
+++ if(lv_creal(P_code_value) == -128) // Prompt code: same -128 -> -127 replacement
+++ {
+++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+++ }
+++ if(lv_cimag(P_code_value) == -128)
+++ {
+++ P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+++ }
+++
+++ if(lv_creal(L_code_value) == -128) // Late code: same -128 -> -127 replacement
+++ {
+++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+++ }
+++ if(lv_cimag(L_code_value) == -128)
+++ {
+++ L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+++ }
+++
+++ if(lv_creal(VL_code_value) == -128) // Very Late code: same -128 -> -127 replacement
+++ {
+++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+++ }
+++ if(lv_cimag(VL_code_value) == -128)
+++ {
+++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+++ }
+++
+++ // Carrier wipe-off: mix the input sample with the local carrier sample
+++ bb_signal_sample = input[i] * carrier[i];
+++ // Accumulate the five correlator outputs (VE, E, P, L, VL) for this sample
+++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+++ }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief SSE4.1 kernel for 16-byte aligned inputs: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlations ("safe" variant: -128 code values are handled); outputs are always overwritten, also when num_points < 8
+++ \param input The input signal samples (read with aligned 128-bit loads)
+++ \param carrier The carrier samples the input is multiplied by (read with aligned 128-bit loads)
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single accumulated value)
+++ \param E_out Early correlation output (single accumulated value)
+++ \param P_out Prompt correlation output (single accumulated value)
+++ \param L_out Late correlation output (single accumulated value)
+++ \param VL_out Very Late correlation output (single accumulated value)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8; // 8 complex 8-bit samples per 128-bit register
+++
+++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++    __m128i real_output, imag_output;
+++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 real_output_ps, imag_output_ps;
+++    __m128i minus128control;
+++
+++    __m128i minus128 = _mm_set1_epi8 (-128);
+++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    float VE_out_real = 0;
+++    float VE_out_imag = 0;
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++    float VL_out_real = 0;
+++    float VL_out_imag = 0;
+++
+++    real_VE_code_acc = _mm_setzero_ps();
+++    imag_VE_code_acc = _mm_setzero_ps();
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++    real_VL_code_acc = _mm_setzero_ps();
+++    imag_VL_code_acc = _mm_setzero_ps();
+++
+++    // Executed unconditionally: with sse_iters == 0 this block just writes zeros to the outputs, which the tail loop below then accumulates onto
+++    {
+++        for(unsigned int number = 0;number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            x = _mm_load_si128((__m128i*)input_ptr);
+++            y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++            x_abs = _mm_abs_epi8 (x);
+++
+++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+++
+++            imag_output = _mm_slli_si128 (imag_output, 1); // shift one byte so imaginary results sit in the byte next to their real part
+++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // interleave: bytes selected by mult1 come from real_output
+++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++            //Get very early values
+++            y = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++            //Get early values
+++            y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++            //Get prompt values
+++            y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++            //Get late values
+++            y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++            //Get very late values
+++            y = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++
+++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++        _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
+++        {
+++            VE_out_real += real_VE_dotProductVector[i];
+++            VE_out_imag += imag_VE_dotProductVector[i];
+++            E_out_real += real_E_dotProductVector[i];
+++            E_out_imag += imag_E_dotProductVector[i];
+++            P_out_real += real_P_dotProductVector[i];
+++            P_out_imag += imag_P_dotProductVector[i];
+++            L_out_real += real_L_dotProductVector[i];
+++            L_out_imag += imag_L_dotProductVector[i];
+++            VL_out_real += real_VL_dotProductVector[i];
+++            VL_out_imag += imag_VL_dotProductVector[i];
+++        }
+++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++    }
+++
+++    lv_16sc_t bb_signal_sample, VE_code_value, E_code_value, P_code_value, L_code_value, VL_code_value; // scalar temporaries for the tail
+++    for(unsigned int i=0; i < num_points%8; ++i) // remaining samples that do not fill a whole register (zero iterations when num_points is a multiple of 8)
+++    {
+++        VE_code_value = *VE_code_ptr++;
+++        E_code_value = *E_code_ptr++;
+++        P_code_value = *P_code_ptr++;
+++        L_code_value = *L_code_ptr++;
+++        VL_code_value = *VL_code_ptr++;
+++
+++        if(lv_creal(VE_code_value) == -128) // "safe" handling: -128 is replaced by -127, as in the generic kernel
+++        {
+++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+++        }
+++        if(lv_cimag(VE_code_value) == -128)
+++        {
+++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+++        }
+++
+++        if(lv_creal(E_code_value) == -128)
+++        {
+++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+++        }
+++        if(lv_cimag(E_code_value) == -128)
+++        {
+++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+++        }
+++
+++        if(lv_creal(P_code_value) == -128)
+++        {
+++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+++        }
+++        if(lv_cimag(P_code_value) == -128)
+++        {
+++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+++        }
+++
+++        if(lv_creal(L_code_value) == -128)
+++        {
+++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+++        }
+++        if(lv_cimag(L_code_value) == -128)
+++        {
+++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+++        }
+++
+++        if(lv_creal(VL_code_value) == -128) // the very late code gets the same clamp as the other four replicas
+++        {
+++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+++        }
+++        if(lv_cimag(VL_code_value) == -128)
+++        {
+++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+++        }
+++
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++#include <stdio.h>
+++#include <tmmintrin.h>
+++
+++/*!
+++ \brief Generic (plain C) kernel, aligned-dispatch name: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlations; any code component equal to -128 is replaced by -127 before it is used
+++ \param input The input signal samples
+++ \param carrier The carrier samples the input is multiplied by (carrier wipe-off)
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single accumulated value, overwritten)
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param VL_out Very Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    *VE_out = 0; // every output is an accumulator and starts from zero
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++
+++    lv_16sc_t VE_code_value; // code samples widened from 8 to 16 bits per component
+++    lv_16sc_t E_code_value;
+++    lv_16sc_t P_code_value;
+++    lv_16sc_t L_code_value;
+++    lv_16sc_t VL_code_value;
+++    lv_16sc_t bb_signal_sample; // input sample after carrier wipe-off
+++
+++    for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points
+++    {
+++        VE_code_value = VE_code[i];
+++        E_code_value = E_code[i];
+++        P_code_value = P_code[i];
+++        L_code_value = L_code[i];
+++        VL_code_value = VL_code[i];
+++
+++        if(lv_creal(VE_code_value) == -128) // "safe" handling: -128 is replaced by -127, real part first
+++        {
+++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+++        }
+++        if(lv_cimag(VE_code_value) == -128) // ... then the imaginary part
+++        {
+++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+++        }
+++
+++        if(lv_creal(E_code_value) == -128)
+++        {
+++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+++        }
+++        if(lv_cimag(E_code_value) == -128)
+++        {
+++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+++        }
+++
+++        if(lv_creal(P_code_value) == -128)
+++        {
+++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+++        }
+++        if(lv_cimag(P_code_value) == -128)
+++        {
+++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+++        }
+++
+++        if(lv_creal(L_code_value) == -128)
+++        {
+++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+++        }
+++        if(lv_cimag(L_code_value) == -128)
+++        {
+++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+++        }
+++
+++        if(lv_creal(VL_code_value) == -128)
+++        {
+++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+++        }
+++        if(lv_cimag(VL_code_value) == -128)
+++        {
+++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+++        }
+++
+++        // Perform the carrier wipe-off
+++        bb_signal_sample = input[i] * carrier[i];
+++        // Accumulate the very early, early, prompt, late and very late correlations
+++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,554 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
+++ * real part and 8 bits the imaginary part), and accumulates the result
+++ * in 32-bit single-precision floating point values, returning float32 values:
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 16 bits vectors) It returns the input
+++ * signal in base band (BB)
+++ * - Very Early values are calculated by multiplying the input signal in BB by the
+++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ * - Very Late values are calculated by multiplying the input signal in BB by the
+++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+++ *
+++ * -------------------------------------------------------------------------
+++ * Bits analysis
+++ *
+++ * input = 8 bits
+++ * carrier = 8 bits
+++ * XX_code = 8 bits
+++ * XX_out16 = 16 bits
+++ * bb_signal_sample = 8 bits
+++ *
+++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+++ *
+++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief SSE4.1 kernel for unaligned inputs: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlations ("unsafe" variant: -128 inputs are not checked); outputs are always overwritten, also when num_points < 8
+++ \param input The input signal samples (read with unaligned 128-bit loads)
+++ \param carrier The carrier samples the input is multiplied by (read with unaligned 128-bit loads)
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single accumulated value)
+++ \param E_out Early correlation output (single accumulated value)
+++ \param P_out Prompt correlation output (single accumulated value)
+++ \param L_out Late correlation output (single accumulated value)
+++ \param VL_out Very Late correlation output (single accumulated value)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8; // 8 complex 8-bit samples per 128-bit register
+++
+++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++    __m128i real_output, imag_output;
+++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 real_output_ps, imag_output_ps;
+++
+++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    float VE_out_real = 0;
+++    float VE_out_imag = 0;
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++    float VL_out_real = 0;
+++    float VL_out_imag = 0;
+++
+++    real_VE_code_acc = _mm_setzero_ps();
+++    imag_VE_code_acc = _mm_setzero_ps();
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++    real_VL_code_acc = _mm_setzero_ps();
+++    imag_VL_code_acc = _mm_setzero_ps();
+++
+++    // Executed unconditionally: with sse_iters == 0 this block just writes zeros to the outputs, which the tail loop below then accumulates onto
+++    {
+++        for(unsigned int number = 0;number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            x = _mm_lddqu_si128((__m128i*)input_ptr);
+++            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++            x_abs = _mm_abs_epi8 (x);
+++
+++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+++
+++            imag_output = _mm_slli_si128 (imag_output, 1); // shift one byte so imaginary results sit in the byte next to their real part
+++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // interleave: bytes selected by mult1 come from real_output
+++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++            //Get very early values
+++            y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++            //Get early values
+++            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++            //Get prompt values
+++            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++            //Get late values
+++            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++            //Get very late values
+++            y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++
+++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++        _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
+++        {
+++            VE_out_real += real_VE_dotProductVector[i];
+++            VE_out_imag += imag_VE_dotProductVector[i];
+++            E_out_real += real_E_dotProductVector[i];
+++            E_out_imag += imag_E_dotProductVector[i];
+++            P_out_real += real_P_dotProductVector[i];
+++            P_out_imag += imag_P_dotProductVector[i];
+++            L_out_real += real_L_dotProductVector[i];
+++            L_out_imag += imag_L_dotProductVector[i];
+++            VL_out_real += real_VL_dotProductVector[i];
+++            VL_out_imag += imag_VL_dotProductVector[i];
+++        }
+++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++    }
+++
+++    lv_16sc_t bb_signal_sample;
+++    for(unsigned int i=0; i < num_points%8; ++i) // remaining samples that do not fill a whole register
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++#include <stdio.h>
+++#include <tmmintrin.h>
+++
+++/*!
+++ \brief Generic (plain C) kernel: carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlations ("unsafe" variant: inputs are used as they are, -128 values are not checked)
+++ \param input The input signal samples
+++ \param carrier The carrier samples the input is multiplied by (carrier wipe-off)
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (single accumulated value, overwritten)
+++ \param E_out Early correlation output (single accumulated value, overwritten)
+++ \param P_out Prompt correlation output (single accumulated value, overwritten)
+++ \param L_out Late correlation output (single accumulated value, overwritten)
+++ \param VL_out Very Late correlation output (single accumulated value, overwritten)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    *VE_out = 0; // every output is an accumulator and starts from zero
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++
+++    lv_16sc_t bb_signal_sample; // input sample after carrier wipe-off
+++
+++    for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points
+++    {
+++        // Perform the carrier wipe-off
+++        bb_signal_sample = input[i] * carrier[i];
+++        // Accumulate the very early, early, prompt, late and very late correlations
+++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+++#include <float.h>
+++#include <string.h>
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief SSE4.1 kernel (aligned loads): carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlations
+++ \param input The input signal samples, read with _mm_load_si128 (16-byte aligned)
+++ \param carrier The local carrier samples, read with _mm_load_si128 (16-byte aligned)
+++ \param VE_code Very Early PRN code replica input (16-byte aligned)
+++ \param E_code Early PRN code replica input (16-byte aligned)
+++ \param P_code Prompt PRN code replica input (16-byte aligned)
+++ \param L_code Late PRN code replica input (16-byte aligned)
+++ \param VL_code Very Late PRN code replica input (16-byte aligned)
+++ \param VE_out Very Early correlation output (one accumulated value, always overwritten)
+++ \param E_out Early correlation output (one accumulated value, always overwritten)
+++ \param P_out Prompt correlation output (one accumulated value, always overwritten)
+++ \param L_out Late correlation output (one accumulated value, always overwritten)
+++ \param VL_out Very Late correlation output (one accumulated value, always overwritten)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8; // eight complex samples are consumed per SIMD iteration
+++
+++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++    __m128i real_output, imag_output;
+++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 real_output_ps, imag_output_ps;
+++
+++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
+++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // blend mask: selects the even bytes
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    float VE_out_real = 0;
+++    float VE_out_imag = 0;
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++    float VL_out_real = 0;
+++    float VL_out_imag = 0;
+++
+++    real_VE_code_acc = _mm_setzero_ps();
+++    imag_VE_code_acc = _mm_setzero_ps();
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++    real_VL_code_acc = _mm_setzero_ps();
+++    imag_VL_code_acc = _mm_setzero_ps();
+++
+++    // No "sse_iters > 0" guard: even with zero SIMD iterations the reduction below must write 0 to every output before the tail loop accumulates into it
+++    {
+++        for(unsigned int number = 0;number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            x = _mm_load_si128((__m128i*)input_ptr);
+++            y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++            x_abs = _mm_abs_epi8 (x);
+++
+++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+++
+++            imag_output = _mm_slli_si128 (imag_output, 1); // move the imaginary bytes to the odd byte positions
+++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // even bytes from real_output, odd bytes from imag_output
+++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++            //Get very early values
+++            y = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++            //Get early values
+++            y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++            //Get prompt values
+++            y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++            //Get late values
+++            y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++            //Get very late values
+++            y = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++
+++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++        _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
+++
+++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of each accumulator
+++        {
+++            VE_out_real += real_VE_dotProductVector[i];
+++            VE_out_imag += imag_VE_dotProductVector[i];
+++            E_out_real += real_E_dotProductVector[i];
+++            E_out_imag += imag_E_dotProductVector[i];
+++            P_out_real += real_P_dotProductVector[i];
+++            P_out_imag += imag_P_dotProductVector[i];
+++            L_out_real += real_L_dotProductVector[i];
+++            L_out_imag += imag_L_dotProductVector[i];
+++            VL_out_real += real_VL_dotProductVector[i];
+++            VL_out_imag += imag_VL_dotProductVector[i];
+++        }
+++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++    }
+++
+++    lv_16sc_t bb_signal_sample;
+++    for(unsigned int i=0; i < num_points%8; ++i) // scalar tail for the samples left over after the SIMD loop
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++#include <stdio.h>
+++#include <tmmintrin.h>
+++
+++/*!
+++ \brief Generic (plain C) kernel: carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlations
+++ \param input The input signal samples
+++ \param carrier The local carrier samples, multiplied element-wise with input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (one accumulated value, overwritten)
+++ \param E_out Early correlation output (one accumulated value, overwritten)
+++ \param P_out Prompt correlation output (one accumulated value, overwritten)
+++ \param L_out Late correlation output (one accumulated value, overwritten)
+++ \param VL_out Very Late correlation output (one accumulated value, overwritten)
+++ \param num_points The number of complex values in each input vector
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    *VE_out = 0; // the outputs are running sums, so they start from zero
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++
+++    lv_16sc_t bb_signal_sample; // wiped-off sample, shared by the five correlators
+++
+++    for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points, no signed/unsigned comparison
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = input[i] * carrier[i];
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,210 @@
+++/*!
+++ * \file volk_gnsssdr_8u_x2_multiply_8u.h
+++ * \brief Volk protokernel: multiplies unsigned char values
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that multiplies unsigned char values (8 bits data)
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
+++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++#include <emmintrin.h>
+++/*!
+++ \brief Multiplies two vectors of unsigned char element by element, keeping the low 8 bits of each product (SSE3, unaligned loads/stores)
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++
+++    const unsigned int sse_iters = num_points / 16; // sixteen bytes are processed per SIMD iteration
+++
+++    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+++    unsigned char* c = cChar;
+++    const unsigned char* a = aChar;
+++    const unsigned char* b = bChar;
+++    const __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask: low byte of every 16-bit lane
+++    for(unsigned int number = 0;number < sse_iters; number++){
+++        x = _mm_lddqu_si128((__m128i*)a);
+++        y = _mm_lddqu_si128((__m128i*)b);
+++
+++        // There is no 8-bit SIMD multiply: widen odd bytes (x1, y1) and even bytes (x2, y2) to one byte per 16-bit lane
+++        x1 = _mm_srli_si128 (x, 1);
+++        x1 = _mm_and_si128 (x1, mult1);
+++        x2 = _mm_and_si128 (x, mult1);
+++
+++        y1 = _mm_srli_si128 (y, 1);
+++        y1 = _mm_and_si128 (y1, mult1);
+++        y2 = _mm_and_si128 (y, mult1);
+++
+++        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
+++        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
+++
+++        tmp = _mm_and_si128 (x1_mult_y1, mult1); // keep the low 8 bits of each odd-byte product
+++        tmp1 = _mm_slli_si128 (tmp, 1); // and move them back to the odd byte positions
+++        tmp2 = _mm_and_si128 (x2_mult_y2, mult1); // low 8 bits of each even-byte product
+++        totalc = _mm_or_si128 (tmp1, tmp2);
+++
+++        _mm_storeu_si128((__m128i*)c, totalc);
+++
+++        a += 16;
+++        b += 16;
+++        c += 16;
+++    }
+++
+++    for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the bytes left over after the SIMD loop
+++    {
+++        *c++ = (*a++) * (*b++);
+++    }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies two vectors of unsigned char element by element; each product is stored truncated to unsigned char (generic C)
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++    unsigned char* cPtr = cChar;
+++    const unsigned char* aPtr = aChar;
+++    const unsigned char* bPtr = bChar;
+++
+++    for(unsigned int number = 0; number < num_points; number++){ // unsigned counter: same type as num_points
+++        *cPtr++ = (*aPtr++) * (*bPtr++);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
+++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
+++
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_SSE3
+++#include <pmmintrin.h>
+++#include <emmintrin.h>
+++/*!
+++ \brief Multiplies two vectors of unsigned char element by element, keeping the low 8 bits of each product (SSE3, aligned loads/stores)
+++ \param cChar The unsigned char vector where the results will be stored; written with _mm_store_si128 (16-byte aligned)
+++ \param aChar One of the unsigned char vectors to be multiplied; read with _mm_load_si128 (16-byte aligned)
+++ \param bChar One of the unsigned char vectors to be multiplied; read with _mm_load_si128 (16-byte aligned)
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++
+++    const unsigned int sse_iters = num_points / 16; // sixteen bytes are processed per SIMD iteration
+++
+++    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+++    unsigned char* c = cChar;
+++    const unsigned char* a = aChar;
+++    const unsigned char* b = bChar;
+++    const __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask: low byte of every 16-bit lane
+++    for(unsigned int number = 0;number < sse_iters; number++){
+++        x = _mm_load_si128((__m128i*)a);
+++        y = _mm_load_si128((__m128i*)b);
+++
+++        // There is no 8-bit SIMD multiply: widen odd bytes (x1, y1) and even bytes (x2, y2) to one byte per 16-bit lane
+++        x1 = _mm_srli_si128 (x, 1);
+++        x1 = _mm_and_si128 (x1, mult1);
+++        x2 = _mm_and_si128 (x, mult1);
+++
+++        y1 = _mm_srli_si128 (y, 1);
+++        y1 = _mm_and_si128 (y1, mult1);
+++        y2 = _mm_and_si128 (y, mult1);
+++
+++        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
+++        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
+++
+++        tmp = _mm_and_si128 (x1_mult_y1, mult1); // keep the low 8 bits of each odd-byte product
+++        tmp1 = _mm_slli_si128 (tmp, 1); // and move them back to the odd byte positions
+++        tmp2 = _mm_and_si128 (x2_mult_y2, mult1); // low 8 bits of each even-byte product
+++        totalc = _mm_or_si128 (tmp1, tmp2);
+++
+++        _mm_store_si128((__m128i*)c, totalc);
+++
+++        a += 16;
+++        b += 16;
+++        c += 16;
+++    }
+++
+++    for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the bytes left over after the SIMD loop
+++    {
+++        *c++ = (*a++) * (*b++);
+++    }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies two vectors of unsigned char element by element; each product is stored truncated to unsigned char (generic C)
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++    unsigned char* cPtr = cChar;
+++    const unsigned char* aPtr = aChar;
+++    const unsigned char* bPtr = bChar;
+++
+++    for(unsigned int number = 0; number < num_points; number++){ // unsigned counter: same type as num_points
+++        *cPtr++ = (*aPtr++) * (*bPtr++);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief ORC entry point for the unsigned char multiply: forwards all arguments unchanged to volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl
+++ \param cVector The unsigned char vector where the results will be stored
+++ \param aVector One of the unsigned char vectors to be multiplied
+++ \param bVector One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points); // declared only; the definition is not in this header
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
+++ volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points); // the "_u_" wrapper calls the "_a_" implementation as-is
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 2014-10-17 01:53:55.000000000 +0200
++@@ -0,0 +1,866 @@
+++/*!
+++ * \file volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
+++ * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
+++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ *
+++ * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2007 Julien Pommier
+++ *
+++ * This software is provided 'as-is', without any express or implied
+++ * warranty. In no event will the authors be held liable for any damages
+++ * arising from the use of this software.
+++ *
+++ * Permission is granted to anyone to use this software for any purpose,
+++ * including commercial applications, and to alter it and redistribute it
+++ * freely, subject to the following restrictions:
+++ *
+++ * 1. The origin of this software must not be misrepresented; you must not
+++ * claim that you wrote the original software. If you use this software
+++ * in a product, an acknowledgment in the product documentation would be
+++ * appreciated but is not required.
+++ * 2. Altered source versions must be plainly marked as such, and must not be
+++ * misrepresented as being the original software.
+++ * 3. This notice may not be removed or altered from any source distribution.
+++ *
+++ *(this is the zlib license)
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2012 Giovanni Garberoglio
+++ * Interdisciplinary Laboratory for Computational Science (LISC)
+++ * Fondazione Bruno Kessler and University of Trento
+++ * via Sommarive, 18
+++ * I-38123 Trento (Italy)
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include <tmmintrin.h>
+++/*! \brief Fills d_carr_sign with the local carrier cos(phase) - j*sin(phase), the phase advancing by phase_step_rad per sample (AVX, unaligned)
+++ \param d_carr_sign Output vector receiving num_points complex carrier samples
+++ \param phase_rad_init Carrier phase, in radians, of the first output sample
+++ \param phase_step_rad Phase increment, in radians, between consecutive output samples
+++ \param num_points The number of complex samples to generate
+++ */
+++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+++
+++    // Eight phases are evaluated per iteration with a vectorized sincos (algorithm credited in the
+++    // file header to J. Pommier and G. Garberoglio): |x| is reduced by multiples of Pi/4, a sine and
+++    // a cosine polynomial are evaluated on the reduced argument, then the right one is picked per lane
+++    // and the signs are fixed up. The num_points % 8 leftover samples use scalar cos()/sin() at the end.
+++
+++    const unsigned int sse_iters = num_points / 8; // number of full 8-sample AVX iterations
+++
+++    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); // DP1 + DP2 + DP3 add up to Pi/4, split for extra precision
+++    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
+++    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
+++    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); // only the sign bit set
+++    __m128i _pi32avx_1 = _mm_set1_epi32(1);
+++    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
+++    __m128i _pi32avx_2 = _mm_set1_epi32(2);
+++    __m128i _pi32avx_4 = _mm_set1_epi32(4);
+++    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
+++    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
+++    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
+++    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
+++    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
+++    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
+++    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
+++    __m256 _ps256_1 = _mm256_set1_ps(1.f);
+++    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
+++
+++    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); // every lane advances by eight samples per iteration
+++
+++    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+++    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
+++    __m256i imm0, imm2, imm4;
+++    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
+++    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
+++    __VOLK_ATTR_ALIGNED(32) float cos_value[8];
+++
+++    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+++
+++    for(unsigned int i = 0; i < sse_iters; i++)
+++    {
+++
+++        x = phase_rad_array;
+++
+++        /* extract the sign bit (upper one) */
+++        sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
+++
+++        /* take the absolute value */
+++        x = _mm256_xor_ps(x, sign_bit_sin);
+++
+++        /* scale by 4/Pi */
+++        y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
+++
+++        /* we use SSE2 routines to perform the integer ops */
+++
+++        // Truncate to integers in an integer register and split it with the integer extract (no float/int vector type mixing)
+++        imm2 = _mm256_cvttps_epi32(y);
+++        imm2_1 = _mm256_extractf128_si256 (imm2, 0);
+++        imm2_2 = _mm256_extractf128_si256 (imm2, 1);
+++
+++        imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
+++        imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
+++
+++        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
+++        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
+++
+++        //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+++        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+++
+++        y = _mm256_cvtepi32_ps(imm2);
+++
+++        imm4_1 = imm2_1;
+++        imm4_2 = imm2_2;
+++
+++        imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
+++        imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
+++
+++        imm0_1 = _mm_slli_epi32(imm0_1, 29); // bit 2 moved up to the sign-bit position
+++        imm0_2 = _mm_slli_epi32(imm0_2, 29);
+++
+++        //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
+++        imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
+++
+++        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
+++        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
+++
+++        imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+++        imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+++
+++        //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+++        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+++
+++        swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+++        poly_mask = _mm256_castsi256_ps(imm2);
+++
+++        /* The magic pass: "Extended precision modular arithmetic"
+++         x = ((x - y * DP1) - y * DP2) - y * DP3; */
+++        xmm1 = _ps256_minus_cephes_DP1;
+++        xmm2 = _ps256_minus_cephes_DP2;
+++        xmm3 = _ps256_minus_cephes_DP3;
+++        xmm1 = _mm256_mul_ps(y, xmm1);
+++        xmm2 = _mm256_mul_ps(y, xmm2);
+++        xmm3 = _mm256_mul_ps(y, xmm3);
+++        x = _mm256_add_ps(x, xmm1);
+++        x = _mm256_add_ps(x, xmm2);
+++        x = _mm256_add_ps(x, xmm3);
+++
+++        imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
+++        imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
+++
+++        imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
+++        imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
+++
+++        imm4_1 = _mm_slli_epi32(imm4_1, 29);
+++        imm4_2 = _mm_slli_epi32(imm4_2, 29);
+++
+++        //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
+++        imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
+++
+++        sign_bit_cos = _mm256_castsi256_ps(imm4);
+++
+++        sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+++
+++        /* Evaluate the first polynom (0 <= x <= Pi/4) */
+++        z = _mm256_mul_ps(x,x);
+++        y = _ps256_coscof_p0;
+++
+++        y = _mm256_mul_ps(y, z);
+++        y = _mm256_add_ps(y, _ps256_coscof_p1);
+++        y = _mm256_mul_ps(y, z);
+++        y = _mm256_add_ps(y, _ps256_coscof_p2);
+++        y = _mm256_mul_ps(y, z);
+++        y = _mm256_mul_ps(y, z);
+++        tmp = _mm256_mul_ps(z, _ps256_0p5);
+++        y = _mm256_sub_ps(y, tmp);
+++        y = _mm256_add_ps(y, _ps256_1);
+++
+++        /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+++
+++        y2 = _ps256_sincof_p0;
+++        y2 = _mm256_mul_ps(y2, z);
+++        y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
+++        y2 = _mm256_mul_ps(y2, z);
+++        y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
+++        y2 = _mm256_mul_ps(y2, z);
+++        y2 = _mm256_mul_ps(y2, x);
+++        y2 = _mm256_add_ps(y2, x);
+++
+++        /* select the correct result from the two polynoms */
+++        xmm3 = poly_mask;
+++        ysin2 = _mm256_and_ps(xmm3, y2);
+++        ysin1 = _mm256_andnot_ps(xmm3, y);
+++        y2 = _mm256_sub_ps(y2,ysin2);
+++        y = _mm256_sub_ps(y, ysin1);
+++
+++        xmm1 = _mm256_add_ps(ysin1,ysin2);
+++        xmm2 = _mm256_add_ps(y,y2);
+++
+++        /* update the sign */
+++        s = _mm256_xor_ps(xmm1, sign_bit_sin);
+++        c = _mm256_xor_ps(xmm2, sign_bit_cos);
+++
+++        //GNSS-SDR needs to return -sin
+++        s = _mm256_xor_ps(s, _ps256_sign_mask);
+++
+++        _mm256_storeu_ps ((float*)sin_value, s);
+++        _mm256_storeu_ps ((float*)cos_value, c);
+++
+++        for(int j = 0; j < 8; j++) // interleave into complex samples; j avoids shadowing the outer loop index
+++        {
+++            d_carr_sign[j] = lv_cmake(cos_value[j], sin_value[j]);
+++        }
+++        d_carr_sign += 8;
+++
+++        phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
+++    }
+++
+++    if (num_points%8!=0)
+++    {
+++        __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
+++        _mm256_storeu_ps (phase_rad_store, phase_rad_array); // float store: the vector holds phases, not integers
+++
+++        float phase_rad = phase_rad_store[0]; // lane 0 is the phase of the next sample to generate
+++
+++        for(unsigned int i = 0; i < num_points%8; i++)
+++        {
+++            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+++            d_carr_sign++;
+++            phase_rad += phase_step_rad;
+++        }
+++    }
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
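+++/*! \brief Updates the local carrier replica stored in d_carr_sign (SSE2, unaligned variant)
+++ \param d_carr_sign The complex vector where the local carrier samples are stored
+++ \param phase_rad_init Carrier phase, in radians, of the first sample
+++ \param phase_step_rad Phase increment, in radians, between consecutive samples
+++ \param num_points The number of complex samples in d_carr_sign
+++*/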
+++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+++
+++// float* pointer1 = (float*)&phase_rad_init;
+++// *pointer1 = 0;
+++// float* pointer2 = (float*)&phase_step_rad;
+++// *pointer2 = 0.5;
+++
+++ const unsigned int sse_iters = num_points / 4;
+++
+++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
+++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
+++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
+++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
+++ __m128i _pi32_1 = _mm_set1_epi32(1);
+++ __m128i _pi32_inv1 = _mm_set1_epi32(~1);
+++ __m128i _pi32_2 = _mm_set1_epi32(2);
+++ __m128i _pi32_4 = _mm_set1_epi32(4);
+++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
+++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
+++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
+++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
+++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
+++ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
+++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
+++ __m128 _ps_1 = _mm_set1_ps(1.f);
+++ __m128 _ps_0p5 = _mm_set1_ps(0.5f);
+++
+++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
+++
+++ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+++ __m128 xmm1, xmm2, xmm3, sign_bit_sin;
+++ __m128i emm0, emm2, emm4;
+++ __VOLK_ATTR_ALIGNED(16) float sin_value[4];
+++ __VOLK_ATTR_ALIGNED(16) float cos_value[4];
+++
+++ phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+++
+++ for(int i = 0; i < sse_iters; i++)
+++ {
+++ x = phase_rad_array;
+++
+++ /* extract the sign bit (upper one) */
+++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
+++
+++ /* take the absolute value */
+++ x = _mm_xor_ps(x, sign_bit_sin);
+++
+++ /* scale by 4/Pi */
+++ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+++
+++ /* store the integer part of y in emm2 */
+++ emm2 = _mm_cvttps_epi32(y);
+++
+++ /* j=(j+1) & (~1) (see the cephes sources) */
+++ emm2 = _mm_add_epi32(emm2, _pi32_1);
+++ emm2 = _mm_and_si128(emm2, _pi32_inv1);
+++ y = _mm_cvtepi32_ps(emm2);
+++
+++ emm4 = emm2;
+++
+++ /* get the swap sign flag for the sine */
+++ emm0 = _mm_and_si128(emm2, _pi32_4);
+++ emm0 = _mm_slli_epi32(emm0, 29);
+++ swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+++
+++ /* get the polynom selection mask for the sine*/
+++ emm2 = _mm_and_si128(emm2, _pi32_2);
+++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+++ poly_mask = _mm_castsi128_ps(emm2);
+++
+++ /* The magic pass: "Extended precision modular arithmetic"
+++ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
+++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
+++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
+++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
+++
+++ emm4 = _mm_sub_epi32(emm4, _pi32_2);
+++ emm4 = _mm_andnot_si128(emm4, _pi32_4);
+++ emm4 = _mm_slli_epi32(emm4, 29);
+++ sign_bit_cos = _mm_castsi128_ps(emm4);
+++
+++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+++
+++ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+++ z = _mm_mul_ps(x,x);
+++ y = _ps_coscof_p0;
+++ y = _mm_mul_ps(y, z);
+++ y = _mm_add_ps(y, _ps_coscof_p1);
+++ y = _mm_mul_ps(y, z);
+++ y = _mm_add_ps(y, _ps_coscof_p2);
+++ y = _mm_mul_ps(y, _mm_mul_ps(z, z));
+++ tmp = _mm_mul_ps(z, _ps_0p5);
+++ y = _mm_sub_ps(y, tmp);
+++ y = _mm_add_ps(y, _ps_1);
+++
+++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+++ y2 = _ps_sincof_p0;
+++ y2 = _mm_mul_ps(y2, z);
+++ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+++ y2 = _mm_mul_ps(y2, z);
+++ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
+++ y2 = _mm_add_ps(y2, x);
+++
+++ /* select the correct result from the two polynoms */
+++ xmm3 = poly_mask;
+++ ysin2 = _mm_and_ps(xmm3, y2);
+++ ysin1 = _mm_andnot_ps(xmm3, y);
+++ y2 = _mm_sub_ps(y2,ysin2);
+++ y = _mm_sub_ps(y, ysin1);
+++
+++ xmm1 = _mm_add_ps(ysin1,ysin2);
+++ xmm2 = _mm_add_ps(y,y2);
+++
+++ /* update the sign */
+++ s = _mm_xor_ps(xmm1, sign_bit_sin);
+++ c = _mm_xor_ps(xmm2, sign_bit_cos);
+++
+++ //GNSS-SDR needs to return -sin
+++ s = _mm_xor_ps(s, _ps_sign_mask);
+++
+++ _mm_storeu_ps ((float*)sin_value, s);
+++ _mm_storeu_ps ((float*)cos_value, c);
+++
+++ for(int i = 0; i < 4; i++)
+++ {
+++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]);
+++ }
+++ d_carr_sign += 4;
+++
+++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
+++ }
+++
+++ if (num_points%4!=0)
+++ {
+++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
+++ _mm_storeu_si128 ((__m128i*)phase_rad_store, phase_rad_array);
+++
+++ float phase_rad = phase_rad_store[0];
+++
+++ for(int i = 0; i < num_points%4; i++)
+++ {
+++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+++ d_carr_sign++;
+++ phase_rad += phase_step_rad;
+++ }
+++ }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Generates the local carrier replica for a linearly increasing phase (generic version)
+++ \param d_carr_sign Output buffer; each sample is set to lv_cmake(cos(phase), -sin(phase))
+++ \param phase_rad_init Phase of the first output sample, in radians
+++ \param phase_step_rad Phase increment between consecutive output samples, in radians
+++ \param num_points Number of complex samples to write to d_carr_sign
+++*/
+++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+++
+++    /* Plain scalar loop: the phase is accumulated in a float by adding
+++       phase_step_rad after every sample, and cos()/sin() are evaluated
+++       once per output sample. */
+++
+++    float phase_rad = phase_rad_init;
+++    for(unsigned int n = 0; n < num_points; n++)
+++    {
+++        *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+++        d_carr_sign++;
+++        phase_rad += phase_step_rad;
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
+++
+++#include <volk_gnsssdr/volk_gnsssdr_common.h>
+++#include <inttypes.h>
+++#include <stdio.h>
+++
+++#ifdef LV_HAVE_AVX
+++#include <immintrin.h>
+++/*!
+++ \brief Generates the local carrier replica for a linearly increasing phase (AVX version, aligned variant)
+++ \param d_carr_sign Output buffer; sample n is set to lv_cmake(cos(p_n), -sin(p_n)), with p_n nominally phase_rad_init + n * phase_step_rad
+++ \param phase_rad_init Phase of the first output sample, in radians
+++ \param phase_step_rad Phase increment between consecutive output samples, in radians
+++ \param num_points Number of complex samples to write to d_carr_sign
+++ */
+++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+++
+++    /* Eight phases are processed per iteration; their sine and cosine are obtained
+++       with the Cephes-style range reduction and polynomials coded below. The
+++       remaining num_points % 8 samples are computed with cos()/sin() at the end. */
+++
+++    const unsigned int avx_iters = num_points / 8;
+++
+++    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
+++    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
+++    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
+++    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
+++    __m128i _pi32avx_1 = _mm_set1_epi32(1);
+++    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
+++    __m128i _pi32avx_2 = _mm_set1_epi32(2);
+++    __m128i _pi32avx_4 = _mm_set1_epi32(4);
+++    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
+++    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
+++    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
+++    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
+++    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
+++    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
+++    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
+++    __m256 _ps256_1 = _mm256_set1_ps(1.f);
+++    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
+++
+++    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad);
+++
+++    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+++    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
+++    __m256i imm0, imm2, imm4;
+++    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
+++    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
+++    __VOLK_ATTR_ALIGNED(32) float cos_value[8];
+++
+++    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+++
+++    for(unsigned int i = 0; i < avx_iters; i++)
+++    {
+++
+++        x = phase_rad_array;
+++
+++        /* extract the sign bit (upper one) */
+++        sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
+++
+++        /* take the absolute value */
+++        x = _mm256_xor_ps(x, sign_bit_sin);
+++
+++        /* scale by 4/Pi */
+++        y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
+++
+++        /* we use SSE2 routines to perform the integer ops */
+++
+++        // Truncate to integers, then split the 256-bit integer vector into its two 128-bit halves
+++        imm2 = _mm256_cvttps_epi32(y);
+++        imm2_1 = _mm256_extractf128_si256 (imm2, 0);
+++        imm2_2 = _mm256_extractf128_si256 (imm2, 1);
+++
+++        imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
+++        imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
+++
+++        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
+++        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
+++
+++        // Recombine the two halves into one 256-bit vector.
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+++        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+++
+++        y = _mm256_cvtepi32_ps(imm2);
+++
+++        imm4_1 = imm2_1;
+++        imm4_2 = imm2_2;
+++
+++        imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
+++        imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
+++
+++        imm0_1 = _mm_slli_epi32(imm0_1, 29);
+++        imm0_2 = _mm_slli_epi32(imm0_2, 29);
+++
+++        // Recombine the two halves into one 256-bit vector.
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
+++        imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
+++
+++        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
+++        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
+++
+++        imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+++        imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+++
+++        // Recombine the two halves into one 256-bit vector.
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+++        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+++
+++        swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+++        poly_mask = _mm256_castsi256_ps(imm2);
+++
+++        /* The magic pass: "Extended precision modular arithmetic"
+++           x = ((x - y * DP1) - y * DP2) - y * DP3; */
+++        xmm1 = _ps256_minus_cephes_DP1;
+++        xmm2 = _ps256_minus_cephes_DP2;
+++        xmm3 = _ps256_minus_cephes_DP3;
+++        xmm1 = _mm256_mul_ps(y, xmm1);
+++        xmm2 = _mm256_mul_ps(y, xmm2);
+++        xmm3 = _mm256_mul_ps(y, xmm3);
+++        x = _mm256_add_ps(x, xmm1);
+++        x = _mm256_add_ps(x, xmm2);
+++        x = _mm256_add_ps(x, xmm3);
+++
+++        imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
+++        imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
+++
+++        imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
+++        imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
+++
+++        imm4_1 = _mm_slli_epi32(imm4_1, 29);
+++        imm4_2 = _mm_slli_epi32(imm4_2, 29);
+++
+++        // Recombine the two halves into one 256-bit vector.
+++        //_mm256_set_m128i not defined in some versions of immintrin.h
+++        //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
+++        imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
+++
+++        sign_bit_cos = _mm256_castsi256_ps(imm4);
+++
+++        sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+++
+++        /* Evaluate the first polynomial (cosine coefficients) on the reduced argument */
+++        z = _mm256_mul_ps(x,x);
+++        y = _ps256_coscof_p0;
+++
+++        y = _mm256_mul_ps(y, z);
+++        y = _mm256_add_ps(y, _ps256_coscof_p1);
+++        y = _mm256_mul_ps(y, z);
+++        y = _mm256_add_ps(y, _ps256_coscof_p2);
+++        y = _mm256_mul_ps(y, z);
+++        y = _mm256_mul_ps(y, z);
+++        tmp = _mm256_mul_ps(z, _ps256_0p5);
+++        y = _mm256_sub_ps(y, tmp);
+++        y = _mm256_add_ps(y, _ps256_1);
+++
+++        /* Evaluate the second polynomial (sine coefficients) on the reduced argument */
+++
+++        y2 = _ps256_sincof_p0;
+++        y2 = _mm256_mul_ps(y2, z);
+++        y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
+++        y2 = _mm256_mul_ps(y2, z);
+++        y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
+++        y2 = _mm256_mul_ps(y2, z);
+++        y2 = _mm256_mul_ps(y2, x);
+++        y2 = _mm256_add_ps(y2, x);
+++
+++        /* select the correct result from the two polynomials */
+++        xmm3 = poly_mask;
+++        ysin2 = _mm256_and_ps(xmm3, y2);
+++        ysin1 = _mm256_andnot_ps(xmm3, y);
+++        y2 = _mm256_sub_ps(y2,ysin2);
+++        y = _mm256_sub_ps(y, ysin1);
+++
+++        xmm1 = _mm256_add_ps(ysin1,ysin2);
+++        xmm2 = _mm256_add_ps(y,y2);
+++
+++        /* update the sign */
+++        s = _mm256_xor_ps(xmm1, sign_bit_sin);
+++        c = _mm256_xor_ps(xmm2, sign_bit_cos);
+++
+++        //GNSS-SDR needs to return -sin
+++        s = _mm256_xor_ps(s, _ps256_sign_mask);
+++
+++        _mm256_store_ps (sin_value, s);
+++        _mm256_store_ps (cos_value, c);
+++
+++        for(unsigned int k = 0; k < 8; k++)
+++        {
+++            d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
+++        }
+++        d_carr_sign += 8;
+++
+++        phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
+++    }
+++
+++    if (num_points%8!=0)
+++    {
+++        __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
+++        _mm256_store_ps (phase_rad_store, phase_rad_array);
+++
+++        float phase_rad = phase_rad_store[0];
+++
+++        for(unsigned int k = 0; k < num_points%8; k++)
+++        {
+++            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+++            d_carr_sign++;
+++            phase_rad += phase_step_rad;
+++        }
+++    }
+++}
+++#endif /* LV_HAVE_AVX */
+++
+++#ifdef LV_HAVE_SSE2
+++#include <emmintrin.h>
+++/*!
+++ \brief Generates the local carrier replica for a linearly increasing phase (SSE2 version, aligned variant)
+++ \param d_carr_sign Output buffer; sample n is set to lv_cmake(cos(p_n), -sin(p_n)), with p_n nominally phase_rad_init + n * phase_step_rad
+++ \param phase_rad_init Phase of the first output sample, in radians
+++ \param phase_step_rad Phase increment between consecutive output samples, in radians
+++ \param num_points Number of complex samples to write to d_carr_sign
+++ */
+++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+++
+++    /* Four phases are processed per iteration; their sine and cosine are obtained
+++       with the Cephes-style range reduction and polynomials coded below. The
+++       remaining num_points % 4 samples are computed with cos()/sin() at the end. */
+++
+++    const unsigned int sse_iters = num_points / 4;
+++
+++    __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
+++    __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
+++    __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
+++    __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
+++    __m128i _pi32_1 = _mm_set1_epi32(1);
+++    __m128i _pi32_inv1 = _mm_set1_epi32(~1);
+++    __m128i _pi32_2 = _mm_set1_epi32(2);
+++    __m128i _pi32_4 = _mm_set1_epi32(4);
+++    __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
+++    __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
+++    __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
+++    __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
+++    __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
+++    __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
+++    __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
+++    __m128 _ps_1 = _mm_set1_ps(1.f);
+++    __m128 _ps_0p5 = _mm_set1_ps(0.5f);
+++
+++    __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
+++
+++    __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
+++    __m128 xmm1, xmm2, xmm3, sign_bit_sin;
+++    __m128i emm0, emm2, emm4;
+++    __VOLK_ATTR_ALIGNED(16) float sin_value[4];
+++    __VOLK_ATTR_ALIGNED(16) float cos_value[4];
+++
+++    phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
+++
+++    for(unsigned int i = 0; i < sse_iters; i++)
+++    {
+++        x = phase_rad_array;
+++
+++        /* extract the sign bit (upper one) */
+++        sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
+++
+++        /* take the absolute value */
+++        x = _mm_xor_ps(x, sign_bit_sin);
+++
+++        /* scale by 4/Pi */
+++        y = _mm_mul_ps(x, _ps_cephes_FOPI);
+++
+++        /* store the integer part of y in emm2 */
+++        emm2 = _mm_cvttps_epi32(y);
+++
+++        /* j=(j+1) & (~1) (see the cephes sources) */
+++        emm2 = _mm_add_epi32(emm2, _pi32_1);
+++        emm2 = _mm_and_si128(emm2, _pi32_inv1);
+++        y = _mm_cvtepi32_ps(emm2);
+++
+++        emm4 = emm2;
+++
+++        /* get the swap sign flag for the sine (bit 2 of j moved to the float sign bit) */
+++        emm0 = _mm_and_si128(emm2, _pi32_4);
+++        emm0 = _mm_slli_epi32(emm0, 29);
+++        swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+++
+++        /* get the polynomial selection mask for the sine */
+++        emm2 = _mm_and_si128(emm2, _pi32_2);
+++        emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+++        poly_mask = _mm_castsi128_ps(emm2);
+++
+++        /* The magic pass: "Extended precision modular arithmetic"
+++           x = ((x - y * DP1) - y * DP2) - y * DP3; */
+++        xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
+++        xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
+++        xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
+++        x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
+++
+++        emm4 = _mm_sub_epi32(emm4, _pi32_2);
+++        emm4 = _mm_andnot_si128(emm4, _pi32_4);
+++        emm4 = _mm_slli_epi32(emm4, 29);
+++        sign_bit_cos = _mm_castsi128_ps(emm4);
+++
+++        sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+++
+++        /* Evaluate the first polynomial (cosine coefficients) on the reduced argument */
+++        z = _mm_mul_ps(x,x);
+++        y = _ps_coscof_p0;
+++        y = _mm_mul_ps(y, z);
+++        y = _mm_add_ps(y, _ps_coscof_p1);
+++        y = _mm_mul_ps(y, z);
+++        y = _mm_add_ps(y, _ps_coscof_p2);
+++        y = _mm_mul_ps(y, _mm_mul_ps(z, z));
+++        tmp = _mm_mul_ps(z, _ps_0p5);
+++        y = _mm_sub_ps(y, tmp);
+++        y = _mm_add_ps(y, _ps_1);
+++
+++        /* Evaluate the second polynomial (sine coefficients) on the reduced argument */
+++        y2 = _ps_sincof_p0;
+++        y2 = _mm_mul_ps(y2, z);
+++        y2 = _mm_add_ps(y2, _ps_sincof_p1);
+++        y2 = _mm_mul_ps(y2, z);
+++        y2 = _mm_add_ps(y2, _ps_sincof_p2);
+++        y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
+++        y2 = _mm_add_ps(y2, x);
+++
+++        /* select the correct result from the two polynomials */
+++        xmm3 = poly_mask;
+++        ysin2 = _mm_and_ps(xmm3, y2);
+++        ysin1 = _mm_andnot_ps(xmm3, y);
+++        y2 = _mm_sub_ps(y2,ysin2);
+++        y = _mm_sub_ps(y, ysin1);
+++
+++        xmm1 = _mm_add_ps(ysin1,ysin2);
+++        xmm2 = _mm_add_ps(y,y2);
+++
+++        /* update the sign */
+++        s = _mm_xor_ps(xmm1, sign_bit_sin);
+++        c = _mm_xor_ps(xmm2, sign_bit_cos);
+++
+++        //GNSS-SDR needs to return -sin
+++        s = _mm_xor_ps(s, _ps_sign_mask);
+++
+++        _mm_store_ps (sin_value, s);
+++        _mm_store_ps (cos_value, c);
+++
+++        for(unsigned int k = 0; k < 4; k++)
+++        {
+++            d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
+++        }
+++        d_carr_sign += 4;
+++
+++        phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
+++    }
+++
+++    if (num_points%4!=0)
+++    {
+++        __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
+++        _mm_store_ps (phase_rad_store, phase_rad_array); /* float store: the register holds floats, not integers */
+++
+++        float phase_rad = phase_rad_store[0];
+++
+++        for(unsigned int k = 0; k < num_points%4; k++)
+++        {
+++            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+++            d_carr_sign++;
+++            phase_rad += phase_step_rad;
+++        }
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Generates the local carrier replica for a linearly increasing phase (generic version, aligned variant)
+++ \param d_carr_sign Output buffer; each sample is set to lv_cmake(cos(phase), -sin(phase))
+++ \param phase_rad_init Phase of the first output sample, in radians
+++ \param phase_step_rad Phase increment between consecutive output samples, in radians
+++ \param num_points Number of complex samples to write to d_carr_sign
+++ */
+++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+++
+++    /* Plain scalar loop: the phase is accumulated in a float by adding
+++       phase_step_rad after every sample, and cos()/sin() are evaluated
+++       once per output sample. */
+++
+++    float phase_rad = phase_rad_init;
+++    for(unsigned int n = 0; n < num_points; n++)
+++    {
+++        *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+++        d_carr_sign++;
+++        phase_rad += phase_step_rad;
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */
+++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt
++--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-15 01:55:08.000000000 +0200
++@@ -406,8 +406,10 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8
++ # if we find one that matches our current system architecture
++ # set up the assembler flags and include the source files
++ foreach(ARCH ${ASM_ARCHS_AVAILABLE})
+++ message(STATUS "--==>> -CFLAGS1: ${FULL_C_FLAGS}")
++ string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}")
++ if( ASM_ARCH STREQUAL "armv7" )
+++ set(ASM-ATT $ENV{ASM})
++ message(STATUS "---- Adding ASM files") # we always use ATT syntax
++ message(STATUS "-- Detected armv7 architecture; enabling ASM")
++ # setup architecture specific assembler flags
++@@ -420,20 +422,13 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8
++ message(STATUS "Adding source file: ${asm_file}")
++ endforeach(asm_file)
++ endif()
++- enable_language(ASM)
++- set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS})
++- message(STATUS "c flags: ${FULL_C_FLAGS}")
++- message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
+++ set(CMAKE_ASM-ATT_FLAGS_INIT ${ARCH_ASM_FLAGS})
+++ enable_language(ASM-ATT) # this must be after flags_init
+++ message(STATUS "asm flags: ${CMAKE_ASM-ATT_FLAGS}")
++ endforeach(ARCH)
++
++ else(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
++ message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.")
++- foreach(machine_name ${available_machines})
++- string(REGEX MATCH "neon" NEON_MACHINE ${machine_name})
++- if( NEON_MACHINE STREQUAL "neon")
++- message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support")
++- endif()
++- endforeach()
++ endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
++
++ ########################################################################
++@@ -517,11 +512,24 @@ if(MSVC)
++ endif()
++
++ #create the volk_gnsssdr runtime library
++-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
+++
+++#MODIFICATIONS BY GNSS-SDR
+++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
+++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)
+++
+++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
+++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})
+++
+++source_group("Kernels" FILES ${h_files})
+++source_group("Common Macros" FILES ${CommonMacros})
+++source_group("ORC Files" FILES ${orc})
+++#END OF MODIFICATIONS
+++
++ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
++ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
++ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")
++
+++
++ install(TARGETS volk_gnsssdr
++ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_runtime" # .so file
++ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel" # .lib file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc
++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 01:54:35.000000000 +0200
++@@ -5,9 +5,7 @@
++ #include <boost/tokenizer.hpp>
++ #include <boost/xpressive/xpressive.hpp>
++ #include <iostream>
++-#include <fstream>
++ #include <vector>
++-#include <map>
++ #include <list>
++ #include <ctime>
++ #include <cmath>
++@@ -217,6 +215,72 @@ inline void run_cast_test3_s32fc(volk_gn
++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
++ }
++
+++//ADDED BY GNSS-SDR. START -- run_cast_test<N>[_s8i|_s8ic|_s32f|_s32fc]: call func iter times with the first N entries of buffs, the optional scalar, vlen and the arch name.
+++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++
+++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
+++}
+++//ADDED BY GNSS-SDR. END
+++
++ // This function is a nop that helps resolve GNU Radio bugs 582 and 583.
++ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol)
++ // won't happen on armhf (reported on cortex A9 and A15).
++@@ -330,9 +394,9 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ lv_32fc_t scalar,
++ int vlen,
++ int iter,
++- std::vector<volk_gnsssdr_test_results_t> *results,
++- std::string puppet_master_name,
++- bool benchmark_mode,
+++ std::vector<std::string> *best_arch_vector = 0,
+++ std::string puppet_master_name = "NULL",
+++ bool benchmark_mode,
++ std::string kernel_regex
++ ) {
++ boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex);
++@@ -340,12 +404,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ // in this case we have a regex and are only looking to test one kernel
++ return false;
++ }
++- if(results) {
++- results->push_back(volk_gnsssdr_test_results_t());
++- results->back().name = name;
++- results->back().vlen = vlen;
++- results->back().iter = iter;
++- }
++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
++
++ // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583
++@@ -426,7 +484,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ } else {
++ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++- } else throw "unsupported 1 arg function >1 scalars";
+++ }
+++ //ADDED BY GNSS-SDR. START
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ //ADDED BY GNSS-SDR. END
+++ else throw "unsupported 1 arg function >1 scalars";
++ break;
++ case 2:
++ if(inputsc.size() == 0) {
++@@ -437,7 +505,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ } else {
++ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++- } else throw "unsupported 2 arg function >1 scalars";
+++ }
+++ //ADDED BY GNSS-SDR. START
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ //ADDED BY GNSS-SDR. END
+++ else throw "unsupported 2 arg function >1 scalars";
++ break;
++ case 3:
++ if(inputsc.size() == 0) {
++@@ -448,11 +526,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ } else {
++ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
++ }
++- } else throw "unsupported 3 arg function >1 scalars";
+++ }
+++ //ADDED BY GNSS-SDR. START
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ //ADDED BY GNSS-SDR. END
+++ else throw "unsupported 3 arg function >1 scalars";
++ break;
++ case 4:
++ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
++ break;
+++ //ADDED BY GNSS-SDR. START
+++ case 8:
+++ if(inputsc.size() == 0) {
+++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else throw "unsupported 8 arg function >1 scalars";
+++ break;
+++ case 12:
+++ if(inputsc.size() == 0) {
+++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+++ } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else if(inputsc.size() == 1 && !inputsc[0].is_float) {
+++ if(inputsc[0].is_complex) {
+++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
+++ } else {
+++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
+++ }
+++ }
+++ else throw "unsupported 12 arg function >1 scalars";
+++ break;
+++ //ADDED BY GNSS-SDR. END
++ default:
++ throw "no function handler for this signature";
++ break;
++@@ -461,13 +589,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++ end = clock();
++ double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC;
++ std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl;
++- if(results) {
++- volk_gnsssdr_test_time_t result;
++- result.name = arch_list[i];
++- result.time = arch_time;
++- result.units = "ms";
++- results->back().results[result.name] = result;
++- }
++
++ profile_times.push_back(arch_time);
++ }
++@@ -568,14 +689,13 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr
++
++ std::cout << "Best aligned arch: " << best_arch_a << std::endl;
++ std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
++- if(results) {
+++ if(best_arch_vector) {
++ if(puppet_master_name == "NULL") {
++- results->back().config_name = name;
++- } else {
++- results->back().config_name = puppet_master_name;
+++ best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u);
+++ }
+++ else {
+++ best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u);
++ }
++- results->back().best_arch_a = best_arch_a;
++- results->back().best_arch_u = best_arch_u;
++ }
++
++ return fail_global;
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h
++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-15 01:55:08.000000000 +0200
++@@ -3,10 +3,7 @@
++
++ #include <cstdlib>
++ #include <string>
++-#include <iostream>
++-#include <fstream>
++ #include <vector>
++-#include <map>
++ #include <volk_gnsssdr/volk_gnsssdr.h>
++ #include <volk_gnsssdr/volk_gnsssdr_common.h>
++
++@@ -24,46 +21,10 @@ volk_gnsssdr_type_t volk_gnsssdr_type_fr
++ float uniform(void);
++ void random_floats(float *buf, unsigned n);
++
++-class volk_gnsssdr_test_time_t {
++- public:
++- std::string name;
++- double time;
++- std::string units;
++-};
+++bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string, bool benchmark_mode=false, std::string kernel_regex="");
++
++-class volk_gnsssdr_test_results_t {
++- public:
++- std::string name;
++- std::string config_name;
++- int vlen;
++- int iter;
++- std::map<std::string, volk_gnsssdr_test_time_t> results;
++- std::string best_arch_a;
++- std::string best_arch_u;
++-};
++
++-bool run_volk_gnsssdr_tests(
++- volk_gnsssdr_func_desc_t,
++- void(*)(),
++- std::string,
++- float,
++- lv_32fc_t,
++- int,
++- int,
++- std::vector<volk_gnsssdr_test_results_t> *results = NULL,
++- std::string puppet_master_name = "NULL",
++- bool benchmark_mode=false,
++- std::string kernel_regex=""
++- );
++-
++-
++-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
++- BOOST_AUTO_TEST_CASE(func##_test) { \
++- BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
++- func##_get_func_desc(), (void (*)())func##_manual, \
++- std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
++- 0); \
++- }
+++#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
++ #define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex)
++ #define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex)
++ typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
++@@ -77,4 +38,25 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f
++ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
++ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
++
+++//ADDED BY GNSS-SDR. START
+++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
+++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
+++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
+++
+++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
+++
+++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
+++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
+++//ADDED BY GNSS-SDR. END
+++
++ #endif //VOLK_QA_UTILS_H
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc
++--- /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc 2014-10-15 01:55:08.000000000 +0200
++@@ -24,6 +24,58 @@
++ #include <volk_gnsssdr/volk_gnsssdr.h>
++ #include <boost/test/unit_test.hpp>
++
+++//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
+++
+++//GNSS-SDR PROTO-KERNELS
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1);
+++
+++VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
+++
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
+++
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
+++
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1);
+++
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1);
+++VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1);
+++
+++
+++
+++
+++
+++
+++
++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000);
++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000);
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,5 @@
+++.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl
+++.dest 4 dst
+++.source 4 src1
+++.source 4 src2
+++addf dst, src1, src2
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,18 @@
+++.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl
+++.source 8 src1
+++.floatparam 8 scalar
+++.dest 8 dst
+++.temp 8 iqprod
+++.temp 4 real
+++.temp 4 imag
+++.temp 4 ac
+++.temp 4 bd
+++.temp 8 swapped
+++x2 mulf iqprod, src1, scalar
+++splitql bd, ac, iqprod
+++subf real, ac, bd
+++swaplq swapped, src1
+++x2 mulf iqprod, swapped, scalar
+++splitql bd, ac, iqprod
+++addf imag, ac, bd
+++mergelq dst, real, imag
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,18 @@
+++.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl
+++.source 8 src1
+++.source 8 src2
+++.dest 8 dst
+++.temp 8 iqprod
+++.temp 4 real
+++.temp 4 imag
+++.temp 4 ac
+++.temp 4 bd
+++.temp 8 swapped
+++x2 mulf iqprod, src1, src2
+++splitql bd, ac, iqprod
+++subf real, ac, bd
+++swaplq swapped, src1
+++x2 mulf iqprod, swapped, src2
+++splitql bd, ac, iqprod
+++addf imag, ac, bd
+++mergelq dst, real, imag
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,40 @@
+++#/*!
+++# * \file volk_gnsssdr_8i_accumulator_s8i.orc
+++# * \brief ORC implementation: 8 bits (char) scalar accumulator
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that implements an accumulator of char values
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl
+++.source 1 src1
+++.accumulator 2 acc
+++.temp 2 sum
+++mergebw sum, 0, src1
+++accw acc, sum
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,39 @@
+++#/*!
+++# * \file volk_gnsssdr_8i_x2_add_8i.orc
+++# * \brief ORC implementation: adds pairs of 8 bits (char) scalars
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that adds pairs of 8 bits (char) scalars
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl
+++.dest 1 dst
+++.source 1 src1
+++.source 1 src2
+++addb dst, src1, src2
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,42 @@
+++#/*!
+++# * \file volk_gnsssdr_8ic_conjugate_8ic.orc
+++# * \brief ORC implementation: calculates the conjugate of a 16 bits vector
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that calculates the conjugate of a
+++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+++# * result = real - j*imag
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl
+++.source 2 src1
+++.dest 2 dst
+++.temp 2 merged
+++mergebw merged, 1, -1
+++x2 mullb dst, merged, src1
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,45 @@
+++#/*!
+++# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc
+++# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that calculates the magnitude squared of a
+++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+++# * result = (real*real) + (imag*imag)
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl
+++.source 2 src1
+++.dest 1 dst
+++.temp 2 iqprod
+++.temp 1 ac
+++.temp 1 bd
+++x2 mullb iqprod, src1, src1
+++splitwb bd, ac, iqprod
+++addb dst, ac, bd
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,58 @@
+++#/*!
+++# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc
+++# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that multiplies a group of 16 bits vectors
+++# * (8 bits the real part and 8 bits the imaginary part) by one constant vector
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl
+++.source 2 src1
+++.param 2 src2real
+++.param 2 src2imag
+++.dest 2 dst
+++.temp 2 iqprod
+++.temp 1 real
+++.temp 1 imag
+++.temp 1 rr
+++.temp 1 ii
+++.temp 1 ri
+++.temp 1 ir
+++x2 mullb iqprod, src1, src2real
+++splitwb ir, rr, iqprod
+++x2 mullb iqprod, src1, src2imag
+++splitwb ii, ri, iqprod
+++subb real, rr, ii
+++addb imag, ri, ir
+++mergebw dst, real, imag
+++
+++
+++
+++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,59 @@
+++#/*!
+++# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc
+++# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that multiplies two 16 bits vectors (8 bits the real part
+++# * and 8 bits the imaginary part) and accumulates them
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl
+++.source 2 src1
+++.source 2 src2
+++.accumulator 2 accreal
+++.accumulator 2 accimag
+++.temp 2 iqprod
+++.temp 1 real
+++.temp 1 imag
+++.temp 2 real2
+++.temp 2 imag2
+++.temp 1 ac
+++.temp 1 bd
+++.temp 2 swapped
+++x2 mullb iqprod, src1, src2
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++swapw swapped, src1
+++x2 mullb iqprod, swapped, src2
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw real2, 0, real
+++accw accreal, real2
+++mergebw imag2, 0, imag
+++accw accimag, imag2
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,57 @@
+++#/*!
+++# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc
+++# * \brief ORC implementation: multiplies two 16 bits vectors
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that multiplies two 16 bits vectors (8 bits the real part
+++# * and 8 bits the imaginary part)
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl
+++.source 2 src1
+++.source 2 src2
+++.dest 2 dst
+++.temp 2 iqprod
+++.temp 1 real
+++.temp 1 imag
+++.temp 1 ac
+++.temp 1 bd
+++.temp 2 swapped
+++x2 mullb iqprod, src1, src2
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++swapw swapped, src1
+++x2 mullb iqprod, swapped, src2
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw dst, real, imag
+++
+++
+++
+++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,139 @@
+++#/*!
+++# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
+++# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that performs the carrier wipe-off mixing and the
+++# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
+++# * real part and 8 bits the imaginary part):
+++# * - The carrier wipe-off is done by multiplying the input signal by the
+++# * carrier (multiplication of 16 bits vectors). It returns the input
+++# * signal in base band (BB)
+++# * - Early values are calculated by multiplying the input signal in BB by the
+++# * early code (multiplication of 16 bits vectors), accumulating the results
+++# * - Prompt values are calculated by multiplying the input signal in BB by the
+++# * prompt code (multiplication of 16 bits vectors), accumulating the results
+++# * - Late values are calculated by multiplying the input signal in BB by the
+++# * late code (multiplication of 16 bits vectors), accumulating the results
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl
+++.source 2 input
+++.source 2 carrier
+++.source 2 E_code
+++.source 2 P_code
+++.accumulator 2 E_out_real
+++.accumulator 2 E_out_imag
+++.accumulator 2 P_out_real
+++.accumulator 2 P_out_imag
+++.temp 2 bb_signal_sample
+++.temp 2 iqprod
+++.temp 1 real
+++.temp 1 imag
+++.temp 1 ac
+++.temp 1 bd
+++.temp 2 swapped
+++# 2-byte temporaries built by mergebw from (0, value) and handed to accw -- NOTE(review): confirm byte order
+++.temp 2 real2
+++.temp 2 imag2
+++# Carrier wipe-off: input times carrier, repacked into bb_signal_sample
+++x2 mullb iqprod, input, carrier
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++swapw swapped, input
+++x2 mullb iqprod, swapped, carrier
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw bb_signal_sample, real, imag
+++# Byte-swapped copy of the baseband sample; it feeds the imag products of both correlators below
+++swapw swapped, bb_signal_sample
+++# Early: bb_signal_sample times E_code, accumulated into E_out_real / E_out_imag
+++x2 mullb iqprod, bb_signal_sample, E_code
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++x2 mullb iqprod, swapped, E_code
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw real2, 0, real
+++mergebw imag2, 0, imag
+++accw E_out_real, real2
+++accw E_out_imag, imag2
+++# Prompt: bb_signal_sample times P_code, accumulated into P_out_real / P_out_imag
+++x2 mullb iqprod, bb_signal_sample, P_code
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++x2 mullb iqprod, swapped, P_code
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw real2, 0, real
+++mergebw imag2, 0, imag
+++accw P_out_real, real2
+++accw P_out_imag, imag2
+++
+++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl
+++.source 2 input
+++.source 2 carrier
+++.source 2 L_code
+++.accumulator 2 L_out_real
+++.accumulator 2 L_out_imag
+++# Scratch variables for the two complex products computed below
+++.temp 2 bb_signal_sample
+++.temp 2 iqprod
+++.temp 1 real
+++.temp 1 imag
+++.temp 1 ac
+++.temp 1 bd
+++.temp 2 swapped
+++# 2-byte temporaries built by mergebw from (0, value) and handed to accw -- NOTE(review): confirm byte order
+++.temp 2 real2
+++.temp 2 imag2
+++# Carrier wipe-off: input times carrier, repacked into bb_signal_sample
+++x2 mullb iqprod, input, carrier
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++swapw swapped, input
+++x2 mullb iqprod, swapped, carrier
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw bb_signal_sample, real, imag
+++# Byte-swapped copy of the baseband sample; it feeds the imag product below
+++swapw swapped, bb_signal_sample
+++# Late: bb_signal_sample times L_code, accumulated into L_out_real / L_out_imag
+++x2 mullb iqprod, bb_signal_sample, L_code
+++splitwb bd, ac, iqprod
+++subb real, ac, bd
+++x2 mullb iqprod, swapped, L_code
+++splitwb bd, ac, iqprod
+++addb imag, ac, bd
+++mergebw real2, 0, real
+++mergebw imag2, 0, imag
+++accw L_out_real, real2
+++accw L_out_imag, imag2
+++
+++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc
++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,39 @@
+++#/*!
+++# * \file volk_gnsssdr_8u_x2_multiply_8u.orc
+++# * \brief ORC implementation: multiplies unsigned char values
+++# * \authors <ul>
+++# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++# * </ul>
+++# *
+++# * ORC code that multiplies unsigned char values (8 bits data)
+++# *
+++# * -------------------------------------------------------------------------
+++# *
+++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++# *
+++# * GNSS-SDR is a software defined Global Navigation
+++# * Satellite Systems receiver
+++# *
+++# * This file is part of GNSS-SDR.
+++# *
+++# * GNSS-SDR is free software: you can redistribute it and/or modify
+++# * it under the terms of the GNU General Public License as published by
+++# * the Free Software Foundation, either version 3 of the License, or
+++# * (at your option) any later version.
+++# *
+++# * GNSS-SDR is distributed in the hope that it will be useful,
+++# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# * GNU General Public License for more details.
+++# *
+++# * You should have received a copy of the GNU General Public License
+++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++# *
+++# * -------------------------------------------------------------------------
+++# */
+++
+++.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl
+++.source 1 src1
+++.source 1 src2
+++.dest 1 dst
+++mullb dst, src1, src2
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/CMakeLists.txt
++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/CMakeLists.txt 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,39 @@
+++# Copyright 2013 Free Software Foundation, Inc.
+++#
+++# This file is part of GNU Radio
+++#
+++# GNU Radio is free software; you can redistribute it and/or modify
+++# it under the terms of the GNU General Public License as published by
+++# the Free Software Foundation; either version 3, or (at your option)
+++# any later version.
+++#
+++# GNU Radio is distributed in the hope that it will be useful,
+++# but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# GNU General Public License for more details.
+++#
+++# You should have received a copy of the GNU General Public License
+++# along with GNU Radio; see the file COPYING. If not, write to
+++# the Free Software Foundation, Inc., 51 Franklin Street,
+++# Boston, MA 02110-1301, USA.
+++
+++########################################################################
+++# Install python files and apps
+++########################################################################
+++include(GrPython)
+++# Python sources of the modtool package, installed under the volk_gnsssdr_modtool directory.
+++VOLK_PYTHON_INSTALL(
+++    FILES
+++        __init__.py
+++        cfg.py
+++        volk_gnsssdr_modtool_generate.py
+++    DESTINATION ${VOLK_PYTHON_DIR}/volk_gnsssdr_modtool
+++    COMPONENT "volk_gnsssdr"
+++)
+++# The volk_gnsssdr_modtool launcher script, installed as a program into the runtime directory.
+++VOLK_PYTHON_INSTALL(
+++    PROGRAMS
+++        volk_gnsssdr_modtool
+++    DESTINATION ${VOLK_RUNTIME_DIR}
+++    COMPONENT "volk_gnsssdr"
+++)
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/README /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/README
++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/README 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/README 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,114 @@
+++The volk_gnsssdr_modtool tool is installed along with VOLK as a way of helping
+++to construct, add to, and interrogate the VOLK library or companion
+++libraries.
+++
+++volk_gnsssdr_modtool is installed into $prefix/bin.
+++
+++VOLK modtool enables creating standalone (out-of-tree) VOLK modules
+++and provides a few tools for sharing VOLK kernels between VOLK
+++modules. If you need to design or work with VOLK kernels away from
+++the canonical VOLK library, this is the tool. If you need to tailor
+++your own VOLK library for whatever reason, this is the tool.
+++
+++The canonical VOLK library installs a volk_gnsssdr.h and a libvolk_gnsssdr.so. Your
+++own library will install volk_gnsssdr_$name.h and libvolk_gnsssdr_$name.so. Ya Gronk?
+++Good.
+++
+++There isn't a substantial difference between the canonical VOLK
+++module and any other VOLK module. They're all peers. Any module
+++created via VOLK modtool will come complete with a default
+++volk_gnsssdr_modtool.cfg file associating the module with the base from which
+++it came, its distinctive $name and its destination (or path). These
+++values (created from user input if VOLK modtool runs without a
+++user-supplied config file or a default config file) serve as default
+++values for some VOLK modtool actions. It's more or less intended for
+++the user to change directories to the top level of a created VOLK
+++module and then run volk_gnsssdr_modtool to take advantage of the values
+++stored in the default volk_gnsssdr_modtool.cfg file.
+++
+++Apart from creating new VOLK modules, VOLK modtool allows you to list
+++the names of kernels in other modules, list the names of kernels in
+++the current module, add kernels from another module into the current
+++module, and remove kernels from the current module. When moving
+++kernels between modules, VOLK modtool does its best to keep the qa
+++and profiling code for those kernels intact. If the base has a test
+++or a profiling call for some kernel, those calls will follow the
+++kernel when VOLK modtool adds that kernel. If QA or profiling
+++requires a puppet kernel, the puppet kernel will follow the original
+++kernel when VOLK modtool adds that original kernel. VOLK modtool
+++respects puppets.
+++
+++======================================================================
+++
+++Installing a new VOLK Library:
+++
+++Run the command "volk_gnsssdr_modtool -i". This will ask you three questions:
+++
+++ name: // the name to give your VOLK library: volk_gnsssdr_<name>
+++ destination: // directory new source tree is built under -- must exist.
+++ // It will create <directory>/volk_gnsssdr_<name>
+++ base: // the directory containing the original VOLK source code
+++
+++The name provided must be alphanumeric (and cannot start with a
+++number). No special characters including dashes and underscores are
+++allowed.
+++
+++This will build a new skeleton directory in the destination provided
+++with the name volk_gnsssdr_<name>. It will contain the necessary structure to
+++build:
+++
+++ mkdir build
+++ cd build
+++ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk_gnsssdr ../
+++ make
+++ sudo make install
+++
+++Right now, the library is empty and contains no kernels. Kernels can
+++be added from another VOLK library using the '-a' option. If not
+++specified, the kernel will be extracted from the base VOLK
+++directory. Using the '-b' allows us to specify another VOLK library to
+++use for this purpose.
+++
+++ volk_gnsssdr_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc
+++
+++This will put the code for the new kernel into
+++<destination>/volk_gnsssdr_<name>/kernels/volk_gnsssdr_<name>/
+++
+++Other kernels must be added by hand. See the following webpages for
+++more information about creating VOLK kernels:
+++ http://gnuradio.org/doc/doxygen/volk_guide.html
+++ http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk
+++
+++
+++======================================================================
+++
+++OPTIONS
+++
+++Options for Adding and Removing Kernels:
+++ -a, --add_kernel
+++ Add kernel from existing VOLK module. Uses the base VOLK module
+++ unless -b is used. Use -n to specify the kernel name.
+++ Requires: -n.
+++ Optional: -b
+++
+++ -A, --add_all_kernels
+++ Add all kernels from existing VOLK module. Uses the base VOLK
+++ module unless -b is used.
+++ Optional: -b
+++
+++ -x, --remove_kernel
+++ Remove kernel from module.
+++ Required: -n.
+++ Optional: -b
+++
+++Options for Listing Kernels:
+++ -l, --list
+++ Lists all kernels available in the base VOLK module.
+++
+++ -k, --kernels
+++ Lists all kernels in this VOLK module.
+++
+++ -r, --remote_list
+++ Lists all kernels in another VOLK module that is specified
+++ using the -b option.
+++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.py
++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.py 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,24 @@
+++#!/usr/bin/env python
+++#
+++# Copyright 2013 Free Software Foundation, Inc.
+++#
+++# This file is part of GNU Radio
+++#
+++# GNU Radio is free software; you can redistribute it and/or modify
+++# it under the terms of the GNU General Public License as published by
+++# the Free Software Foundation; either version 3, or (at your option)
+++# any later version.
+++#
+++# GNU Radio is distributed in the hope that it will be useful,
+++# but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# GNU General Public License for more details.
+++#
+++# You should have received a copy of the GNU General Public License
+++# along with GNU Radio; see the file COPYING. If not, write to
+++# the Free Software Foundation, Inc., 51 Franklin Street,
+++# Boston, MA 02110-1301, USA.
+++#
+++
+++from cfg import volk_gnsssdr_modtool_config
+++from volk_gnsssdr_modtool_generate import volk_gnsssdr_modtool
++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.pyc differ
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.py
++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.py 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,104 @@
+++#!/usr/bin/env python
+++#
+++# Copyright 2013 Free Software Foundation, Inc.
+++#
+++# This file is part of GNU Radio
+++#
+++# GNU Radio is free software; you can redistribute it and/or modify
+++# it under the terms of the GNU General Public License as published by
+++# the Free Software Foundation; either version 3, or (at your option)
+++# any later version.
+++#
+++# GNU Radio is distributed in the hope that it will be useful,
+++# but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# GNU General Public License for more details.
+++#
+++# You should have received a copy of the GNU General Public License
+++# along with GNU Radio; see the file COPYING. If not, write to
+++# the Free Software Foundation, Inc., 51 Franklin Street,
+++# Boston, MA 02110-1301, USA.
+++#
+++
+++import ConfigParser
+++import sys
+++import os
+++import exceptions
+++import re
+++
+++
+++class volk_gnsssdr_modtool_config:  # loads, remaps and verifies the modtool configuration
+++    def key_val_sub(self, num, stuff, section):  # fill $k<num> / $<num> in template <num> with item <num>'s key / value
+++        return re.sub('\$' + 'k' + str(num), stuff[num][0], (re.sub('\$' + str(num), stuff[num][1], section[1][num])))
+++    # Evaluate every verification template; a template that evaluates to False raises ValueError.
+++    def verify(self):
+++        for i in self.verification:
+++            self.verify_section(i)
+++    def remap(self):  # the remap templates go through the same evaluator as verify()
+++        for i in self.remapification:
+++            self.verify_section(i)
+++    # section is (config section name, list of expression templates); template i is paired with item i.
+++    def verify_section(self, section):
+++        stuff = self.cfg.items(section[0])
+++        for i in range(len(section[1])):
+++            # NOTE(review): eval() executes text built from config values. Evaluate once, inside the try, so failures are wrapped.
+++            try:
+++                val = eval(self.key_val_sub(i, stuff, section))
+++                if val == False:
+++                    raise exceptions.ValueError
+++            except ValueError:
+++                raise exceptions.ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+++            except:
+++                raise exceptions.IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+++    # cfg: optional path of a config file. Without it ./volk_gnsssdr_modtool.cfg is read, or, when that
+++    # file does not exist, the three defaults are prompted for. The result is then remapped and verified.
+++    def __init__(self, cfg=None):
+++        self.config_name = 'config'
+++        self.config_defaults = ['name', 'destination', 'base']
+++        self.config_defaults_remap = ['1',
+++                                      'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
+++                                      'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']
+++        # re.match() yields None (never False) on a mismatch, so the template compares against None explicitly.
+++        self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\') is not None',
+++                                       'os.path.exists(\'$1\')',
+++                                       'os.path.exists(\'$2\')']
+++        self.remapification = [(self.config_name, self.config_defaults_remap)]
+++        self.verification = [(self.config_name, self.config_defaults_verify)]
+++        default = os.path.join(os.getcwd(), 'volk_gnsssdr_modtool.cfg')
+++        icfg = ConfigParser.RawConfigParser()
+++        if cfg:
+++            icfg.read(cfg)
+++        elif os.path.exists(default):
+++            icfg.read(default)
+++        else:
+++            print "Initializing config file..."
+++            icfg.add_section(self.config_name)
+++            for kn in self.config_defaults:
+++                rv = raw_input("%s: "%(kn))
+++                icfg.set(self.config_name, kn, rv)
+++        self.cfg = icfg
+++        self.remap()
+++        self.verify()
+++
+++    # Replace section `name` with the key/value pairs of the mapping `inp`.
+++    # An existing section of that name is removed first.
+++    def read_map(self, name, inp):
+++        if self.cfg.has_section(name):
+++            self.cfg.remove_section(name)
+++        self.cfg.add_section(name)
+++        for i in inp:
+++            self.cfg.set(name, i, inp[i])
+++    # Return the items of section `name` as a plain dict.
+++    def get_map(self, name):
+++        retval = {}
+++        stuff = self.cfg.items(name)
+++        for i in stuff:
+++            retval[i[0]] = i[1]
+++        return retval
+++
+++
+++
+++
+++
+++
+++
++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.pyc differ
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool
++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,128 @@
+++#!/usr/bin/env python
+++#
+++# Copyright 2013 Free Software Foundation, Inc.
+++#
+++# This file is part of GNU Radio
+++#
+++# GNU Radio is free software; you can redistribute it and/or modify
+++# it under the terms of the GNU General Public License as published by
+++# the Free Software Foundation; either version 3, or (at your option)
+++# any later version.
+++#
+++# GNU Radio is distributed in the hope that it will be useful,
+++# but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# GNU General Public License for more details.
+++#
+++# You should have received a copy of the GNU General Public License
+++# along with GNU Radio; see the file COPYING. If not, write to
+++# the Free Software Foundation, Inc., 51 Franklin Street,
+++# Boston, MA 02110-1301, USA.
+++#
+++
+++from volk_gnsssdr_modtool import volk_gnsssdr_modtool, volk_gnsssdr_modtool_config
+++from optparse import OptionParser, OptionGroup
+++
+++import exceptions
+++import os
+++import sys
+++
+++if __name__ == '__main__':
+++    parser = OptionParser()
+++    actions = OptionGroup(parser, 'Actions')
+++    actions.add_option('-i', '--install', action='store_true',
+++                       help='Create a new volk_gnsssdr module.')
+++    parser.add_option('-b', '--base_path', action='store', default=None,
+++                      help='Base path for action. By default, volk_gnsssdr_modtool.cfg loads this value.')
+++    parser.add_option('-n', '--kernel_name', action='store', default=None,
+++                      help='Kernel name for action. No default')
+++    parser.add_option('-c', '--config', action='store', dest='config_file', default=None,
+++                      help='Config file for volk_gnsssdr_modtool. By default, volk_gnsssdr_modtool.cfg in the local directory will be used/created.')
+++    actions.add_option('-a', '--add_kernel', action='store_true',
+++                       help='Add kernel from existing volk_gnsssdr module. Requires: -n. Optional: -b')
+++    actions.add_option('-A', '--add_all_kernels', action='store_true',
+++                       help='Add all kernels from existing volk_gnsssdr module. Optional: -b')
+++    actions.add_option('-x', '--remove_kernel', action='store_true',
+++                       help='Remove kernel from module. Required: -n. Optional: -b')
+++    actions.add_option('-l', '--list', action='store_true',
+++                       help='List all kernels in the base.')
+++    actions.add_option('-k', '--kernels', action='store_true',
+++                       help='List all kernels in the module.')
+++    actions.add_option('-r', '--remote_list', action='store_true',
+++                       help='List all available kernels in remote volk_gnsssdr module. Requires: -b.')
+++    actions.add_option('-m', '--moo', action='store_true',
+++                       help='Have you mooed today?')
+++    parser.add_option_group(actions)
+++
+++    (options, args) = parser.parse_args()
+++    if len(sys.argv) < 2:
+++        parser.print_help()
+++
+++    elif options.moo:
+++        print " (__) "
+++        print " (oo) "
+++        print " /------\/ "
+++        print " / | || "
+++        print " * /\---/\ "
+++        print " ~~ ~~ "
+++
+++    else:
+++        my_cfg = volk_gnsssdr_modtool_config(options.config_file)
+++
+++        my_modtool = volk_gnsssdr_modtool(my_cfg.get_map(my_cfg.config_name))
+++
+++        # Each requested action below is handled independently, in this fixed order.
+++        if options.install:
+++            my_modtool.make_module_skeleton()
+++            my_modtool.write_default_cfg(my_cfg.cfg)
+++
+++        # -a: import one kernel, from -b when given, otherwise from the configured base.
+++        if options.add_kernel:
+++            if not options.kernel_name:
+++                raise exceptions.IOError("This action requires the -n option.")
+++            else:
+++                name = options.kernel_name
+++            if options.base_path:
+++                base = options.base_path
+++            else:
+++                base = my_cfg.cfg.get(my_cfg.config_name, 'base')
+++            my_modtool.import_kernel(name, base)
+++
+++        if options.remove_kernel:
+++            if not options.kernel_name:
+++                raise exceptions.IOError("This action requires the -n option.")
+++            else:
+++                name = options.kernel_name
+++            my_modtool.remove_kernel(name)
+++
+++        if options.add_all_kernels:
+++            # -A: import every kernel found in -b, or in the configured base when -b is absent.
+++            if options.base_path:
+++                base = options.base_path
+++            else:
+++                base = my_cfg.cfg.get(my_cfg.config_name, 'base')
+++            kernelset = my_modtool.get_current_kernels(base)
+++            for i in kernelset:
+++                my_modtool.import_kernel(i, base)
+++
+++        if options.remote_list:
+++            if not options.base_path:
+++                raise exceptions.IOError("This action requires the -b option. Try -l or -k for listing kernels in the base or the module.")
+++            else:
+++                base = options.base_path
+++            kernelset = my_modtool.get_current_kernels(base)
+++            for i in kernelset:
+++                print i
+++
+++        if options.list:
+++            kernelset = my_modtool.get_current_kernels()
+++            for i in kernelset:
+++                print i
+++
+++        if options.kernels:
+++            dest = my_cfg.cfg.get(my_cfg.config_name, 'destination')
+++            name = my_cfg.cfg.get(my_cfg.config_name, 'name')
+++            base = os.path.join(dest, 'volk_gnsssdr_' + name)
+++            kernelset = my_modtool.get_current_kernels(base)
+++            for i in kernelset:
+++                print i
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py
++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,330 @@
+++#
+++# Copyright 2013 Free Software Foundation, Inc.
+++#
+++# This file is part of GNU Radio
+++#
+++# GNU Radio is free software; you can redistribute it and/or modify
+++# it under the terms of the GNU General Public License as published by
+++# the Free Software Foundation; either version 3, or (at your option)
+++# any later version.
+++#
+++# GNU Radio is distributed in the hope that it will be useful,
+++# but WITHOUT ANY WARRANTY; without even the implied warranty of
+++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++# GNU General Public License for more details.
+++#
+++# You should have received a copy of the GNU General Public License
+++# along with GNU Radio; see the file COPYING. If not, write to
+++# the Free Software Foundation, Inc., 51 Franklin Street,
+++# Boston, MA 02110-1301, USA.
+++#
+++
+++import os
+++import glob
+++import sys
+++import re
+++import glob
+++import shutil
+++import exceptions
+++from sets import Set
+++
+++class volk_gnsssdr_modtool:
+++    def __init__(self, cfg):  # cfg: mapping read elsewhere in this class via the 'name', 'destination' and 'base' keys
+++        self.volk_gnsssdr = re.compile('volk_gnsssdr')  # library prefix that gets rewritten to volk_gnsssdr_<name>
+++        self.remove_after_underscore = re.compile("_.*")
+++        self.volk_gnsssdr_run_tests = re.compile('^\s*VOLK_RUN_TESTS.*\n', re.MULTILINE)
+++        self.volk_gnsssdr_profile = re.compile('^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE)
+++        self.my_dict = cfg
+++        self.lastline = re.compile('\s*char path\[1024\];.*')
+++        self.badassert = re.compile('^\s*assert\(toked\[0\] == "volk_gnsssdr_.*\n', re.MULTILINE)  # replaced by goodassert in qa_utils.cc
+++        self.goodassert = ' assert(toked[0] == "volk_gnsssdr");\n'
+++        self.baderase = re.compile('^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE)  # replaced by gooderase in qa_utils.cc
+++        self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n'
+++
+++    def get_basename(self, base=None):  # module suffix of `base` (default: configured base); '' for the canonical library
+++        if not base:
+++            base = self.my_dict['base']
+++        candidate = base.split('/')[-1]
+++        # The canonical directory is itself named 'volk_gnsssdr'; it must not be read as a module called 'gnsssdr'.
+++        if candidate == 'volk_gnsssdr' or len(candidate.split('_')) == 1:
+++            return ''
+++        return candidate.split('_')[-1]
+++
+++    def get_current_kernels(self, base=None):  # set of kernel names found among the headers of `base` (default: configured base)
+++        if not base:
+++            base = self.my_dict['base']
+++            name = self.get_basename()
+++        else:
+++            name = self.get_basename(base)
+++        if name == '':
+++            hdr_files = glob.glob(os.path.join(base, "kernels/volk_gnsssdr/*.h"))
+++            begins = re.compile("(?<=volk_gnsssdr_).*")
+++        else:
+++            hdr_files = glob.glob(os.path.join(base, "kernels/volk_gnsssdr_" + name + "/*.h"))
+++            begins = re.compile("(?<=volk_gnsssdr_" + name + "_).*")
+++
+++        datatypes = []
+++        functions = []
+++
+++        # First pass: collect the leading type token (digits followed by letters) of every header name.
+++        for line in hdr_files:
+++
+++            subline = re.search(".*\.h.*", os.path.basename(line))
+++            if subline:
+++                subsubline = begins.search(subline.group(0))
+++                if subsubline:
+++                    dtype = self.remove_after_underscore.sub("", subsubline.group(0))
+++                    subdtype = re.search("[0-9]+[A-z]+", dtype)
+++                    if subdtype:
+++                        datatypes.append(subdtype.group(0))
+++
+++        # Second pass: a kernel name is the text from a known type token up to the '.h' suffix.
+++        datatypes = set(datatypes)
+++
+++        for line in hdr_files:
+++            for dt in datatypes:
+++                if dt in line:
+++                    #subline = re.search("(?<=volk_gnsssdr_)" + dt + ".*(?=\.h)", line);
+++                    subline = re.search(begins.pattern[:-2] + dt + ".*(?=\.h)", line)  # [:-2] drops the trailing '.*'
+++                    if subline:
+++                        functions.append(subline.group(0))
+++
+++        return set(functions)
+++
+++    def make_module_skeleton(self):
+++        # Create <destination>/volk_gnsssdr_<name> from the base tree, leaving the kernel files out.
+++        dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'])
+++        if os.path.exists(dest):
+++            raise exceptions.IOError("Destination %s already exists!"%(dest))
+++
+++        if not os.path.exists(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'kernels/volk_gnsssdr_' + self.my_dict['name'])):
+++            os.makedirs(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'kernels/volk_gnsssdr_' + self.my_dict['name']))
+++
+++        current_kernel_names = self.get_current_kernels()
+++
+++        for root, dirnames, filenames in os.walk(self.my_dict['base']):
+++            for name in filenames:
+++                t_table = map(lambda a: re.search(a, name), current_kernel_names)
+++                t_table = set(t_table)
+++                if t_table == set([None]):  # the file name matches none of the base kernels
+++                    infile = os.path.join(root, name)
+++                    instring = open(infile, 'r').read()
+++                    outstring = re.sub(self.volk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], instring)
+++                    newname = re.sub(self.volk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], name)
+++                    relpath = os.path.relpath(infile, self.my_dict['base'])
+++                    newrelpath = re.sub(self.volk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], relpath)
+++                    dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+++
+++                    if not os.path.exists(os.path.dirname(dest)):
+++                        os.makedirs(os.path.dirname(dest))
+++                    open(dest, 'w+').write(outstring)
+++
+++        # The copied QA and profiling sources still reference kernels that were not copied; strip those lines.
+++        infile = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/testqa.cc')
+++        instring = open(infile, 'r').read()
+++        outstring = re.sub(self.volk_gnsssdr_run_tests, '', instring)
+++        open(infile, 'w+').write(outstring)
+++
+++        infile = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'apps/volk_gnsssdr_' + self.my_dict['name'] + '_profile.cc')
+++        instring = open(infile, 'r').read()
+++        outstring = re.sub(self.volk_gnsssdr_profile, '', instring)
+++        open(infile, 'w+').write(outstring)
+++
+++        infile = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/qa_utils.cc')
+++        instring = open(infile, 'r').read()
+++        outstring = re.sub(self.badassert, self.goodassert, instring)
+++        outstring = re.sub(self.baderase, self.gooderase, outstring)
+++        open(infile, 'w+').write(outstring)
+++
+++    def write_default_cfg(self, cfg):  # write `cfg` to volk_gnsssdr_modtool.cfg at the top of the module tree
+++        outfile = open(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'volk_gnsssdr_modtool.cfg'), 'wb')
+++        cfg.write(outfile)
+++        outfile.close()
+++
+++
+++    def convert_kernel(self, oldvolk_gnsssdr, name, base, inpath, top):  # copy one kernel header (and its .orc files) under this module's prefix
+++        infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h')
+++        instring = open(infile, 'r').read()
+++        outstring = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], instring)
+++        newname = 'volk_gnsssdr_' + self.my_dict['name'] + '_' + name + '.h'
+++        relpath = os.path.relpath(infile, base)
+++        newrelpath = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], relpath)
+++        dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+++
+++        if not os.path.exists(os.path.dirname(dest)):
+++            os.makedirs(os.path.dirname(dest))
+++        open(dest, 'w+').write(outstring)
+++
+++        # copy orc proto-kernels if they exist
+++        for orcfile in glob.glob(inpath + '/orc/' + top + name + '*.orc'):
+++            if os.path.isfile(orcfile):
+++                instring = open(orcfile, 'r').read()
+++                outstring = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], instring)
+++                newname = 'volk_gnsssdr_' + self.my_dict['name'] + '_' + name + '.orc'
+++                relpath = os.path.relpath(orcfile, base)
+++                newrelpath = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], relpath)
+++                dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+++                if not os.path.exists(os.path.dirname(dest)):
+++                    os.makedirs(os.path.dirname(dest))
+++                open(dest, 'w+').write(outstring)
+++
+++
+++    def remove_kernel(self, name):  # delete kernel `name` and its profile/QA lines from this module
+++        basename = self.my_dict['name']
+++        if len(basename) > 0:
+++            top = 'volk_gnsssdr_' + basename + '_'
+++        else:
+++            top = 'volk_gnsssdr_'
+++        base = os.path.join(self.my_dict['destination'], top[:-1])
+++
+++        if not name in self.get_current_kernels():
+++            # NOTE(review): membership is checked against the configured base, while the message names the module path -- confirm intent.
+++            raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base))
+++
+++        # Pass 1: rewrite the profiling app without the lines that mention the kernel. A VOLK_PUPPET line
+++        # that mentions it adds the text after `top` in that line's first argument to the removal set.
+++        inpath = os.path.abspath(base)
+++
+++
+++        kernel = re.compile(name)
+++        search_kernels = Set([kernel])
+++        profile = re.compile('^\s*VOLK_PROFILE')
+++        puppet = re.compile('^\s*VOLK_PUPPET')
+++        src_dest = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc')
+++        infile = open(src_dest)
+++        otherlines = infile.readlines()
+++        open(src_dest, 'w+').write('')
+++
+++        for otherline in otherlines:
+++            write_okay = True
+++            if kernel.search(otherline):
+++                write_okay = False
+++                if puppet.match(otherline):
+++                    args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline)
+++                    m_func = args.group(0).split(',')[0]
+++                    func = re.search('(?<=' + top + ').*', m_func)
+++                    search_kernels.add(re.compile(func.group(0)))
+++            if write_okay:
+++                open(src_dest, 'a').write(otherline)
+++
+++        # Pass 2: rewrite the QA driver without any line that mentions a kernel in the removal set.
+++        src_dest = os.path.join(inpath, 'lib/testqa.cc')
+++        infile = open(src_dest)
+++        otherlines = infile.readlines()
+++        open(src_dest, 'w+').write('')
+++
+++        for otherline in otherlines:
+++            write_okay = True
+++
+++            for kernel in search_kernels:
+++                if kernel.search(otherline):
+++                    write_okay = False
+++
+++            if write_okay:
+++                open(src_dest, 'a').write(otherline)
+++
+++        for kernel in search_kernels:
+++            infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h')
+++            print "Removing kernel %s"%(kernel.pattern)
+++            if os.path.exists(infile):
+++                os.remove(infile)
+++        # remove the orc proto-kernels if they exist. There are no puppets here
+++        # so just need to glob for files matching kernel name
+++        print glob.glob(inpath + '/orc/' + top + name + '*.orc')
+++        for orcfile in glob.glob(inpath + '/orc/' + top + name + '*.orc'):
+++            print orcfile
+++            if(os.path.exists(orcfile)):
+++                os.remove(orcfile)
+++
+++ def import_kernel(self, name, base):
+++ if not (base):
+++ base = self.my_dict['base'];
+++ basename = self.getbasename();
+++ else:
+++ basename = self.get_basename(base);
+++ if not name in self.get_current_kernels(base):
+++ raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base));
+++
+++ inpath = os.path.abspath(base);
+++ if len(basename) > 0:
+++ top = 'volk_gnsssdr_' + basename + '_';
+++ else:
+++ top = 'volk_gnsssdr_'
+++ oldvolk_gnsssdr = re.compile(top[:-1]);
+++
+++ self.convert_kernel(oldvolk_gnsssdr, name, base, inpath, top);
+++
+++ kernel = re.compile(name)
+++ search_kernels = Set([kernel])
+++
+++ profile = re.compile('^\s*VOLK_PROFILE')
+++ puppet = re.compile('^\s*VOLK_PUPPET')
+++ infile = open(os.path.join(inpath, 'apps/', oldvolk_gnsssdr.pattern + '_profile.cc'));
+++ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'apps/volk_gnsssdr_' + self.my_dict['name'] + '_profile.cc'));
+++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'apps/volk_gnsssdr_' + self.my_dict['name'] + '_profile.cc');
+++ lines = infile.readlines();
+++ otherlines = otherinfile.readlines();
+++ open(dest, 'w+').write('');
+++ insert = False;
+++ inserted = False
+++ for otherline in otherlines:
+++
+++ if self.lastline.match(otherline):
+++ insert = True;
+++ if insert and not inserted:
+++ inserted = True;
+++ for line in lines:
+++ if kernel.search(line):
+++ if profile.match(line):
+++ outline = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], line);
+++ open(dest, 'a').write(outline);
+++ elif puppet.match(line):
+++ outline = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], line);
+++ open(dest, 'a').write(outline);
+++ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line)
+++ m_func = args.group(0).split(',')[0];
+++ func = re.search('(?<=' + top + ').*', m_func);
+++ search_kernels.add(re.compile(func.group(0)));
+++ self.convert_kernel(oldvolk_gnsssdr, func.group(0), base, inpath, top);
+++ write_okay = True;
+++ for kernel in search_kernels:
+++ if kernel.search(otherline):
+++ write_okay = False
+++ if write_okay:
+++ open(dest, 'a').write(otherline);
+++
+++ for kernel in search_kernels:
+++ print "Adding kernel %s from module %s"%(kernel.pattern,base)
+++
+++ infile = open(os.path.join(inpath, 'lib/testqa.cc'));
+++ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/testqa.cc'));
+++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/testqa.cc');
+++ lines = infile.readlines();
+++ otherlines = otherinfile.readlines();
+++ open(dest, 'w+').write('');
+++ inserted = False;
+++ insert = False
+++ for otherline in otherlines:
+++
+++ if (re.match('\s*', otherline) == None or re.match('\s*#.*', otherline) == None):
+++
+++ insert = True;
+++ if insert and not inserted:
+++ inserted = True;
+++ for line in lines:
+++ for kernel in search_kernels:
+++ if kernel.search(line):
+++ if self.volk_gnsssdr_run_tests.match(line):
+++ outline = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], line);
+++ open(dest, 'a').write(outline);
+++ write_okay = True;
+++ for kernel in search_kernels:
+++ if kernel.search(otherline):
+++ write_okay = False
+++ if write_okay:
+++ open(dest, 'a').write(otherline);
+++
+++
+++
+++
+++
++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc differ
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_RUNTIME
++-#define INCLUDED_VOLK_RUNTIME
+++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME
+++#define INCLUDED_VOLK_GNSSSDR_RUNTIME
++
++ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
++ #include <volk_gnsssdr/volk_gnsssdr_config_fixed.h>
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
++-#define INCLUDED_VOLK_CONFIG_FIXED_H
+++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
+++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
++
++ #for $i, $arch in enumerate($archs)
++ #define LV_$(arch.name.upper()) $i
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_CPU_H
++-#define INCLUDED_VOLK_CPU_H
+++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H
+++#define INCLUDED_VOLK_GNSSSDR_CPU_H
++
++ #include <volk_gnsssdr/volk_gnsssdr_common.h>
++
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_LIBVOLK_MACHINES_H
++-#define INCLUDED_LIBVOLK_MACHINES_H
+++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
++
++ #include <volk_gnsssdr/volk_gnsssdr_common.h>
++ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h
++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 03:00:41.000000000 +0200
+++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-15 01:55:08.000000000 +0200
++@@ -19,8 +19,8 @@
++ * Boston, MA 02110-1301, USA.
++ */
++
++-#ifndef INCLUDED_VOLK_TYPEDEFS
++-#define INCLUDED_VOLK_TYPEDEFS
+++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS
+++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS
++
++ #include <inttypes.h>
++ #include <volk_gnsssdr/volk_gnsssdr_complex.h>
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg
++--- /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,5 @@
+++[config]
+++name = gnsssdr
+++destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs
+++base = /Users/andres/github/gnuradio/volk
+++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt
+--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt 2014-10-17 04:26:17.000000000 +0200
+@@ -0,0 +1,71 @@
++########################################################################
++# Patching original volk module
++########################################################################
++In order to fit the GNSS-SDR needs, the original volk module must be patched.
++
++The folder containing this file has some patches to automate the process and
++modify the files quickly. To apply them you will need to run the following command:
++$ patch -p5 < /Path/Of/The/Patch/nameOfThePatch.patch
++
++The number after “-p” may change, read the patch documentation for more help.
++
++You may need this information if you want to recreate the volk_gnsssdr module again
++or you want to update the volk_gnsssdr module with the improvements introduced by GNURadio.
++
++########################################################################
++########################################################################
++# Operations applied by the patches and other information (not needed if you know how to apply the patches!!!)
++########################################################################
++########################################################################
++
++To create the volk module you will need to follow the following steps:
++In order to understand and follow the creation and setup of the volk_gnsssdr module I will use some absolute paths: /Users/andres/Github/gnuradio => a cloned repository of the GNURadio project. /Users/andres/Github/gnss-sdr => a cloned repository of the GNSS-SDR project.
++
++########################################################################
++#FIRST STEP: using volk_modtool to create a new volk module
++########################################################################
++GNURadio offers a tool called volk_modtool to create and manage new volk modules and their proto-kernels. The steps to create the volk_gnsssdr module are:
++
++1) Export the PYTHONPATH, that indicates where volk_modtool is:
++$ export PYTHONPATH=/Users/andres/Github/gnuradio/volk/python
++
++2) Go to the folder where volk_modtool executable is: $ cd /Users/andres/Github/gnuradio/volk/python/volk_modtool
++
++3) Execute volk_modtool indicating that we want to create a new volk module (-i): $ ./volk_modtool -i
++
++4) volk_modtool will ask us about the name of the newly created module, the destination folder where you want to store it and the base module (the base module is the volk module inside the GNURadio project): name: gnsssdr destination: /Users/andres/Github/gnss-sdr/src/algorithms/libs base: /Users/andres/github/gnuradio/volk
++
++########################################################################
++#SECOND STEP: add proto-kernels to the module
++########################################################################
++After creating the module you will need to add some proto-kernels to it. To accomplish it you will need to: 1) Copy your proto-kernels inside the /kernels folder. Copy the ORC implementations inside the /orc folder. Copy the macros implementations inside the /kernels/CommonMacros folder. (those folders are found in the root of the volk_gnsssdr module)
++ 2) Add one profiling line for each of the proto-kernels inside the /apps/volk_gnsssdr_profile.cc file.
++
++3) Add one test line for each of the proto-kernels inside the /lib/testqa.cc file. ########################################################################
++#THIRD STEP: modifications to allow profiling of some proto-kernels with special parameters
++######################################################################## Some of the proto-kernels that GNSS-SDR needs are not supported by the profiling environment of the volk_gnsssdr module. In order to profile them some modifications need to be done to two files: 1) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc At the first part of this file there are defined the parameters supported by the environment. The number after run_cast_test indicates the total number of parameters passed to the proto-kernel (input +output parameters). The other part indicates the type of the data passed. Inside func(....) you will need to add the same number of buffs[ ] that the one specified after run_cast_test.
++
++2) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h In the header you will need to add typedefs for the new definitions made in the .cc file. Take care: you will need to add the same number of void * that the one specified after run_cast_test.
++
++########################################################################
++#FOURTH STEP: optional modifications
++########################################################################
++1) Modify /src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt in order to see kernel files, ORC files and macros when generating the IDE project.
++
++2) To be able to use volk_gnsssdr and default volk functions at the same time in the same file, you will need to modify the template files that the volk_gnsssdr module uses at build time to generate some headers.
++The files modified are found inside /tmpl: volk_gnsssdr.tmpl.h
++volk_gnsssdr_typedefs.tmpl.h
++volk_gnsssdr_machines.tmpl.h
++volk_gnsssdr_cpu.tmpl.h
++volk_gnsssdr_config_fixed.tmpl.h The modifications consist of changing the defines of those files to different ones to allow the definition of the volk_gnsssdr functions although the default volk functions are already defined.
++
++########################################################################
++#FIFTH STEP: add volk_gnsssdr module to the GNSS-SDR project
++########################################################################
++In order to add the volk_gnsssdr module to the GNSS-SDR project the CMakeLists.txt global file needs to be edited.
++
++########################################################################
++#SIXTH STEP: using volk_gnsssdr functions
++########################################################################
++To use the proto-kernels inside the volk_gnsssdr project two steps are needed: 1) In the CMakeLists.txt you will need to add ${VOLK_GNSSSDR_INCLUDE_DIRS} inside the include_directories function, and also add ${VOLK_GNSSSDR_LIBRARIES} inside the target_link_libraries function.
++ 2) Add the line #include “volk_gnsssdr.h” at the top of the file.
+\ No newline at end of file
+Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.pyc differ
+Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.pyc differ
+Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc differ
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_RUNTIME
+-#define INCLUDED_VOLK_RUNTIME
++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME
++#define INCLUDED_VOLK_GNSSSDR_RUNTIME
+
+ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
+ #include <volk_gnsssdr/volk_gnsssdr_config_fixed.h>
+@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t
+
+ __VOLK_DECL_END
+
+-#endif /*INCLUDED_VOLK_RUNTIME*/
++#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200
+@@ -19,11 +19,11 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
+-#define INCLUDED_VOLK_CONFIG_FIXED_H
++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
+
+ #for $i, $arch in enumerate($archs)
+ #define LV_$(arch.name.upper()) $i
+ #end for
+
+-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/
++#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_CPU_H
+-#define INCLUDED_VOLK_CPU_H
++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H
++#define INCLUDED_VOLK_GNSSSDR_CPU_H
+
+ #include <volk_gnsssdr/volk_gnsssdr_common.h>
+
+@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch ();
+
+ __VOLK_DECL_END
+
+-#endif /*INCLUDED_VOLK_CPU_H*/
++#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_LIBVOLK_MACHINES_H
+-#define INCLUDED_LIBVOLK_MACHINES_H
++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+
+ #include <volk_gnsssdr/volk_gnsssdr_common.h>
+ #include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
+@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_
+
+ __VOLK_DECL_END
+
+-#endif //INCLUDED_LIBVOLK_MACHINES_H
++#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h
+--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200
+@@ -19,8 +19,8 @@
+ * Boston, MA 02110-1301, USA.
+ */
+
+-#ifndef INCLUDED_VOLK_TYPEDEFS
+-#define INCLUDED_VOLK_TYPEDEFS
++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS
++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS
+
+ #include <inttypes.h>
+ #include <volk_gnsssdr/volk_gnsssdr_complex.h>
+@@ -29,4 +29,4 @@
+ typedef void (*$(kern.pname))($kern.arglist_types);
+ #end for
+
+-#endif /*INCLUDED_VOLK_TYPEDEFS*/
++#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg
+--- /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg 2014-10-17 04:26:39.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg 2014-10-15 01:55:08.000000000 +0200
+@@ -1,5 +1,5 @@
+ [config]
+ name = gnsssdr
+-destination = /Users/andres/Github
++destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs
+ base = /Users/andres/github/gnuradio/volk
+
diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt
--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt 1970-01-01 01:00:00.000000000 +0100
+++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt 2014-10-17 04:26:17.000000000 +0200
@@ -0,0 +1,71 @@
+########################################################################
+# Patching original volk module
+########################################################################
+In order to fit the GNSS-SDR needs, the original volk module must be patched.
+
+The folder containing this file has some patches to automate the process and
+modify the files quickly. To apply them you will need to run the following command:
+$ patch -p5 < /Path/Of/The/Patch/nameOfThePatch.patch
+
+The number after “-p” may change, read the patch documentation for more help.
+
+You may need this information if you want to recreate the volk_gnsssdr module again
+or you want to update the volk_gnsssdr module with the improvements introduced by GNURadio.
+
+########################################################################
+########################################################################
+# Operations applied by the patches and other information (not needed if you know how to apply the patches!!!)
+########################################################################
+########################################################################
+
+To create the volk module you will need to follow the following steps:
+In order to understand and follow the creation and setup of the volk_gnsssdr module I will use some absolute paths: /Users/andres/Github/gnuradio => a cloned repository of the GNURadio project. /Users/andres/Github/gnss-sdr => a cloned repository of the GNSS-SDR project.
+
+########################################################################
+#FIRST STEP: using volk_modtool to create a new volk module
+########################################################################
+GNURadio offers a tool called volk_modtool to create and manage new volk modules and their proto-kernels. The steps to create the volk_gnsssdr module are:
+
+1) Export the PYTHONPATH, that indicates where volk_modtool is:
+$ export PYTHONPATH=/Users/andres/Github/gnuradio/volk/python
+
+2) Go to the folder where volk_modtool executable is: $ cd /Users/andres/Github/gnuradio/volk/python/volk_modtool
+
+3) Execute volk_modtool indicating that we want to create a new volk module (-i): $ ./volk_modtool -i
+
+4) volk_modtool will ask us about the name of the newly created module, the destination folder where you want to store it and the base module (the base module is the volk module inside the GNURadio project): name: gnsssdr destination: /Users/andres/Github/gnss-sdr/src/algorithms/libs base: /Users/andres/github/gnuradio/volk
+
+########################################################################
+#SECOND STEP: add proto-kernels to the module
+########################################################################
+After creating the module you will need to add some proto-kernels to it. To accomplish it you will need to: 1) Copy your proto-kernels inside the /kernels folder. Copy the ORC implementations inside the /orc folder. Copy the macros implementations inside the /kernels/CommonMacros folder. (those folders are found in the root of the volk_gnsssdr module)
+ 2) Add one profiling line for each of the proto-kernels inside the /apps/volk_gnsssdr_profile.cc file.
+
+3) Add one test line for each of the proto-kernels inside the /lib/testqa.cc file. ########################################################################
+#THIRD STEP: modifications to allow profiling of some proto-kernels with special parameters
+######################################################################## Some of the proto-kernels that GNSS-SDR needs are not supported by the profiling environment of the volk_gnsssdr module. In order to profile them some modifications need to be done to two files: 1) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc. The first part of this file defines the parameters supported by the environment. The number after run_cast_test indicates the total number of parameters passed to the proto-kernel (input + output parameters). The other part indicates the type of the data passed. Inside func(....) you will need to add the same number of buffs[ ] as the one specified after run_cast_test.
+
+2) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h. In the header you will need to add typedefs for the new definitions made in the .cc file. Take care: you will need to add the same number of void * as the one specified after run_cast_test.
+
+########################################################################
+#FOURTH STEP: optional modifications
+########################################################################
+1) Modify /src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt in order to see kernel files, ORC files and macros when generating the IDE project.
+
+2) To be able to use volk_gnsssdr and default volk functions at the same time in the same file, you will need to modify the template files that the volk_gnsssdr module uses at build time to generate some headers.
+The files modified are found inside /tmpl: volk_gnsssdr.tmpl.h
+volk_gnsssdr_typedefs.tmpl.h
+volk_gnsssdr_machines.tmpl.h
+volk_gnsssdr_cpu.tmpl.h
+volk_gnsssdr_config_fixed.tmpl.h The modifications consist of changing the defines of those files to different ones to allow the definition of the volk_gnsssdr functions although the default volk functions are already defined.
+
+########################################################################
+#FIFTH STEP: add volk_gnsssdr module to the GNSS-SDR project
+########################################################################
+In order to add the volk_gnsssdr module to the GNSS-SDR project the CMakeLists.txt global file needs to be edited.
+
+########################################################################
+#SIXTH STEP: using volk_gnsssdr functions
+########################################################################
+To use the proto-kernels inside the volk_gnsssdr project two steps are needed: 1) In the CMakeLists.txt you will need to add ${VOLK_GNSSSDR_INCLUDE_DIRS} inside the include_directories function, and also add ${VOLK_GNSSSDR_LIBRARIES} inside the target_link_libraries function.
+ 2) Add the line #include “volk_gnsssdr.h” at the top of the file.
\ No newline at end of file
Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.pyc differ
Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.pyc differ
Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc differ
diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h
--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200
@@ -19,8 +19,8 @@
* Boston, MA 02110-1301, USA.
*/
-#ifndef INCLUDED_VOLK_RUNTIME
-#define INCLUDED_VOLK_RUNTIME
+#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME
+#define INCLUDED_VOLK_GNSSSDR_RUNTIME
#include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
#include <volk_gnsssdr/volk_gnsssdr_config_fixed.h>
@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t
__VOLK_DECL_END
-#endif /*INCLUDED_VOLK_RUNTIME*/
+#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h
--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200
@@ -19,11 +19,11 @@
* Boston, MA 02110-1301, USA.
*/
-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
-#define INCLUDED_VOLK_CONFIG_FIXED_H
+#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
+#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H
#for $i, $arch in enumerate($archs)
#define LV_$(arch.name.upper()) $i
#end for
-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/
+#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h
--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200
@@ -19,8 +19,8 @@
* Boston, MA 02110-1301, USA.
*/
-#ifndef INCLUDED_VOLK_CPU_H
-#define INCLUDED_VOLK_CPU_H
+#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H
+#define INCLUDED_VOLK_GNSSSDR_CPU_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch ();
__VOLK_DECL_END
-#endif /*INCLUDED_VOLK_CPU_H*/
+#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h
--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200
@@ -19,8 +19,8 @@
* Boston, MA 02110-1301, USA.
*/
-#ifndef INCLUDED_LIBVOLK_MACHINES_H
-#define INCLUDED_LIBVOLK_MACHINES_H
+#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_typedefs.h>
@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_
__VOLK_DECL_END
-#endif //INCLUDED_LIBVOLK_MACHINES_H
+#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h
--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200
@@ -19,8 +19,8 @@
* Boston, MA 02110-1301, USA.
*/
-#ifndef INCLUDED_VOLK_TYPEDEFS
-#define INCLUDED_VOLK_TYPEDEFS
+#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS
+#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS
#include <inttypes.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
@@ -29,4 +29,4 @@
typedef void (*$(kern.pname))($kern.arglist_types);
#end for
-#endif /*INCLUDED_VOLK_TYPEDEFS*/
+#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
diff -rupN /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg
--- /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg 2014-10-17 05:07:25.000000000 +0200
+++ /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg 2014-10-15 01:55:08.000000000 +0200
@@ -1,5 +1,5 @@
[config]
name = gnsssdr
-destination = /Users/andres/Github
+destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs
base = /Users/andres/github/gnuradio/volk