
Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next

This commit is contained in:
Carles Fernandez 2016-02-15 14:48:18 +01:00
commit 3385d7a3e1
10 changed files with 580 additions and 359 deletions

View File

@ -48,6 +48,7 @@ Downloading, building and installing [GNU Radio](http://gnuradio.org/redmine/pro
~~~~~~ ~~~~~~
$ git clone https://github.com/gnuradio/pybombs.git $ git clone https://github.com/gnuradio/pybombs.git
$ cd pybombs $ cd pybombs
$ python setup.py build
$ sudo python setup.py install $ sudo python setup.py install
$ pybombs recipes add gr-recipes https://github.com/gnuradio/gr-recipes.git $ pybombs recipes add gr-recipes https://github.com/gnuradio/gr-recipes.git
$ pybombs recipes add gr-etcetera https://github.com/gnuradio/gr-etcetera.git $ pybombs recipes add gr-etcetera https://github.com/gnuradio/gr-etcetera.git
@ -408,7 +409,7 @@ Install Armadillo and dependencies:
$ brew tap homebrew/science $ brew tap homebrew/science
$ brew install cmake hdf5 arpack superlu $ brew install cmake hdf5 arpack superlu
$ brew install armadillo $ brew install armadillo
$ brew install glog gflags $ brew install glog gflags gnutls
~~~~~~ ~~~~~~
#### Build GNSS-SDR #### Build GNSS-SDR

View File

@ -163,19 +163,26 @@ int gps_l1_ca_pvt_cc::general_work (int noutput_items, gr_vector_int &ninput_ite
Gnss_Synchro **in = (Gnss_Synchro **) &input_items[0]; //Get the input pointer Gnss_Synchro **in = (Gnss_Synchro **) &input_items[0]; //Get the input pointer
//Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0]; //Get the output pointer //Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0]; //Get the output pointer
// ############ 1. READ EPHEMERIS FROM GLOBAL MAP ####
d_ls_pvt->gps_ephemeris_map = global_gps_ephemeris_map.get_map_copy();
for (unsigned int i = 0; i < d_nchannels; i++) for (unsigned int i = 0; i < d_nchannels; i++)
{ {
std::map<int,Gps_Ephemeris>::iterator gps_ephemeris_iter;
gps_ephemeris_iter = d_ls_pvt->gps_ephemeris_map.begin();
if (in[i][0].Flag_valid_pseudorange == true) if (in[i][0].Flag_valid_pseudorange == true)
{ {
gnss_pseudoranges_map.insert(std::pair<int,Gnss_Synchro>(in[i][0].PRN, in[i][0])); // store valid pseudoranges in a map gnss_pseudoranges_map.insert(std::pair<int,Gnss_Synchro>(in[i][0].PRN, in[i][0])); // store valid pseudoranges in a map
d_rx_time = in[i][0].d_TOW_at_current_symbol; // all the channels have the same RX timestamp (common RX time pseudoranges) d_rx_time = in[i][0].d_TOW_at_current_symbol; // all the channels have the same RX timestamp (common RX time pseudoranges)
d_rtcm_printer->lock_time(gps_ephemeris_iter->second, d_rx_time, in[i][0]); // keep track of locking time
}
else
{
d_rtcm_printer->lock_time(gps_ephemeris_iter->second, 0.0, in[i][0]);
} }
} }
// ############ 1. READ EPHEMERIS/UTC_MODE/IONO FROM GLOBAL MAPS #### // ############ READ UTC_MODEL/IONO FROM GLOBAL MAPS ####
d_ls_pvt->gps_ephemeris_map = global_gps_ephemeris_map.get_map_copy();
if (global_gps_utc_model_map.size() > 0) if (global_gps_utc_model_map.size() > 0)
{ {
// UTC MODEL data is shared for all the GPS satellites. Read always at a locked channel // UTC MODEL data is shared for all the GPS satellites. Read always at a locked channel
@ -337,7 +344,7 @@ int gps_l1_ca_pvt_cc::general_work (int noutput_items, gr_vector_int &ninput_ite
if (gps_ephemeris_iter != d_ls_pvt->gps_ephemeris_map.end()) if (gps_ephemeris_iter != d_ls_pvt->gps_ephemeris_map.end())
{ {
d_rtcm_printer->Print_Rtcm_MT1019(gps_ephemeris_iter->second); d_rtcm_printer->Print_Rtcm_MT1019(gps_ephemeris_iter->second);
d_rtcm_printer->Print_Rtcm_MT1002(gps_ephemeris_iter->second, d_rx_time, gnss_pseudoranges_map); d_rtcm_printer->Print_Rtcm_MSM(1, gps_ephemeris_iter->second, {}, {}, d_rx_time, gnss_pseudoranges_map, 1234, 0, 0, 0, 0, 0);
} }
b_rtcm_writing_started = true; b_rtcm_writing_started = true;
} }

View File

@ -304,3 +304,9 @@ std::string Rtcm_Printer::print_MT1005_test()
std::string test = rtcm->print_MT1005_test(); std::string test = rtcm->print_MT1005_test();
return test; return test;
} }
unsigned int Rtcm_Printer::lock_time(const Gps_Ephemeris& gps_eph, double obs_time, const Gnss_Synchro & gnss_synchro)
{
return rtcm->lock_time(gps_eph, obs_time, gnss_synchro);
}
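
The new lock_time() method is a thin forwarder to the underlying Rtcm object, so the PVT block shown above can keep per-satellite lock times for its MSM output. A minimal sketch of how a channel update could drive it, assuming the usual gnss-sdr headers; the helper name and include paths are illustrative, not part of this commit:

#include "gnss_synchro.h"   // Gnss_Synchro (assumed header name)
#include "gps_ephemeris.h"  // Gps_Ephemeris (assumed header name)
#include "rtcm_printer.h"   // Rtcm_Printer (assumed header name)

// Hypothetical helper: update the lock-time bookkeeping for one channel.
void update_lock_time(Rtcm_Printer& printer,
        const Gps_Ephemeris& eph,
        const Gnss_Synchro& obs,
        double rx_time)
{
    if (obs.Flag_valid_pseudorange == true)
        {
            printer.lock_time(eph, rx_time, obs);  // valid observable: keep track of locking time
        }
    else
        {
            printer.lock_time(eph, 0.0, obs);      // no valid pseudorange at this epoch
        }
}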

View File

@ -73,6 +73,7 @@ public:
bool more_messages); bool more_messages);
std::string print_MT1005_test(); //<! For testing purposes std::string print_MT1005_test(); //<! For testing purposes
unsigned int lock_time(const Gps_Ephemeris& gps_eph, double obs_time, const Gnss_Synchro & gnss_synchro);
private: private:
std::string rtcm_filename; // String with the RTCM log filename std::string rtcm_filename; // String with the RTCM log filename

View File

@ -38,9 +38,6 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h> #include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#define ROTATOR_RELOAD 512
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -56,20 +53,9 @@
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{ {
unsigned int i = 0; unsigned int i = 0;
int j = 0;
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for(i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) for(i = 0; i < (unsigned int)(num_points); ++i)
{
for(j = 0; j < ROTATOR_RELOAD; ++j)
{
tmp16 = *inVector++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
}
for(i = 0; i < num_points % ROTATOR_RELOAD; ++i)
{ {
tmp16 = *inVector++; tmp16 = *inVector++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
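
With the ROTATOR_RELOAD blocking removed from the generic implementation, the kernel is a single loop that rotates every sample by the running phase and advances the phase accumulator. A usage sketch, assuming the standard VOLK dispatcher generated for this kernel and illustrative signal parameters (1 kHz carrier wipe-off at 4 Msps):

#include <volk_gnsssdr/volk_gnsssdr.h>          // dispatcher (assumed entry point)
#include <volk_gnsssdr/volk_gnsssdr_complex.h>  // lv_16sc_t, lv_32fc_t, lv_cmake
#include <math.h>                               // cosf, sinf

// Hypothetical helper: wipe off a 1 kHz carrier from a block of 16-bit complex samples.
static void wipe_off_carrier(lv_16sc_t* out, const lv_16sc_t* in, unsigned int num_points, lv_32fc_t* phase)
{
    const float phase_step_rad = 2.0f * 3.14159265358979f * 1000.0f / 4000000.0f;   // rad per sample
    const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), -sinf(phase_step_rad));
    volk_gnsssdr_16ic_s32fc_x2_rotator_16ic(out, in, phase_inc, phase, num_points);
    // *phase now carries the accumulated phase into the next block
}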

View File

@ -78,7 +78,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated \param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result \param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/ */
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0,0);
@ -86,7 +86,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out,
const lv_16sc_t** _in_a = in_a; const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out; lv_16sc_t* _out = result;
if (sse_iters > 0) if (sse_iters > 0)
{ {
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out,
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, results;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
@ -116,10 +116,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out,
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr); real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
@ -129,23 +129,23 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out,
imag = _mm_adds_epi16(imag1, imag2); imag = _mm_adds_epi16(imag1, imag2);
realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag); imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
} }
_in_common += 4; _in_common += 4;
} }
for (int n_vec=0;n_vec<num_a_vectors;n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real); realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag); imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]); results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
_mm_store_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0,0);
for (int i = 0; i<4; ++i) for (int i = 0; i < 4; ++i)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -181,7 +181,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out,
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated \param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result \param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/ */
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0,0);
@ -189,7 +189,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out,
const lv_16sc_t** _in_a = in_a; const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out; lv_16sc_t* _out = result;
if (sse_iters > 0) if (sse_iters > 0)
{ {
@ -203,7 +203,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out,
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, results;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
@ -219,10 +219,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out,
{ {
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr); real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
@ -239,16 +239,16 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out,
_in_common += 4; _in_common += 4;
} }
for (int n_vec=0;n_vec<num_a_vectors;n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
result = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
_mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector _mm_storeu_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0,0);
for (int i = 0; i<4; ++i) for (int i = 0; i < 4; ++i)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -284,7 +284,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out,
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated \param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result \param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/ */
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0,0);
@ -292,7 +292,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* out, co
const lv_16sc_t** _in_a = in_a; const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out; lv_16sc_t* _out = result;
if (neon_iters > 0) if (neon_iters > 0)
{ {
@ -319,7 +319,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* out, co
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__builtin_prefetch(_in_a[n_vec] + 8); //__builtin_prefetch(&_in_a[n_vec][number*4] + 8);
// multiply the real*real and imag*imag to get real result // multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r

View File

@ -49,8 +49,9 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy(in_a[n], in, sizeof(lv_16sc_t) * num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)
@ -72,6 +73,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)
@ -92,8 +94,9 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy(in_a[n], in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)
@ -114,8 +117,9 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy(in_a[n], in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++) for(unsigned int n = 0; n < num_a_vectors; n++)

View File

@ -1,11 +1,14 @@
/*! /*!
* \file volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h * \file volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h
* \brief Volk protokernel: multiplies N 16 bits vectors by a common vector phase rotated and accumulates the results in N 16 bits short complex outputs. * \brief Volk protokernel: multiplies N 16 bits vectors by a common vector
* phase rotated and accumulates the results in N 16 bits short complex outputs.
* \authors <ul> * \authors <ul>
* <li> Javier Arribas, 2015. jarribas(at)cttc.es * <li> Javier Arribas, 2015. jarribas(at)cttc.es
* </ul> * </ul>
* *
* Volk protokernel that multiplies N 16 bits vectors by a common vector, which is phase-rotated by phase offset and phase increment, and accumulates the results in N 16 bits short complex outputs. * Volk protokernel that multiplies N 16 bits vectors by a common vector, which is
* phase-rotated by phase offset and phase increment, and accumulates the results
* in N 16 bits short complex outputs.
* It is optimized to perform the N tap correlation process in GNSS receivers. * It is optimized to perform the N tap correlation process in GNSS receivers.
* *
* ------------------------------------------------------------------------- * -------------------------------------------------------------------------
@ -39,36 +42,39 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h> #include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <volk_gnsssdr/saturation_arithmetic.h> #include <volk_gnsssdr/saturation_arithmetic.h>
#include <math.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
/*! /*!
\brief Multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector \brief Rotates and multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector
\param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored \param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored
\param[in] in_common Pointer to one of the vectors to be multiplied and accumulated (reference vector) \param[in] in_common Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector)
\param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated \param[in] phase_inc Phase increment = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad))
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated \param[in,out] phase Initial / final phase
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result \param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/ */
static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0,0); result[n_vec] = lv_cmake(0,0);
} }
for (unsigned int n = 0; n < num_points; n++) for (unsigned int n = 0; n < num_points; n++)
{ {
tmp16 = *in_common++; tmp16 = *in_common++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /*LV_HAVE_GENERIC*/ #endif /*LV_HAVE_GENERIC*/
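
The generic protokernel above is the reference implementation of the N-tap rotate-and-correlate operation used in tracking: one phase-rotated input vector against N local code replicas. A sketch of a three-tap (Early/Prompt/Late) call, invoking the _generic variant directly for clarity; the helper, buffer names and phase step are illustrative:

#include <math.h>   // cosf, sinf
// assumes volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h (this file) is included

// Hypothetical helper: compute Early, Prompt and Late correlations in one pass.
static void correlate_epl(lv_16sc_t corr_out[3],
        const lv_16sc_t* baseband,      // incoming samples, phase-rotated inside the kernel
        const lv_16sc_t* early_code,
        const lv_16sc_t* prompt_code,
        const lv_16sc_t* late_code,
        float phase_step_rad,
        lv_32fc_t* phase,               // carried over between blocks
        unsigned int num_points)
{
    const lv_16sc_t* local_codes[3] = { early_code, prompt_code, late_code };
    const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), -sinf(phase_step_rad)); // as documented above
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(corr_out, baseband, phase_inc, phase,
            local_codes, 3, num_points);
    // corr_out[0..2] now hold the saturated 16-bit complex accumulations for E, P, L.
}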
@ -78,159 +84,160 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
\brief Multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector \brief Rotates and multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector
\param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored \param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored
\param[in] in_common Pointer to one of the vectors to be multiplied and accumulated (reference vector) \param[in] in_common Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector)
\param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated \param[in] phase_inc Phase increment = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad))
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated \param[in,out] phase Initial / final phase
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result \param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/ */
static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a; const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
//todo dyn mem reg //todo dyn mem reg
__m128i* realcacc; __m128i* realcacc;
__m128i* imagcacc; __m128i* imagcacc;
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, results;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2]; __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2]; __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
__m128 yl, yh, tmp1, tmp2, tmp3; __m128 yl, yh, tmp1, tmp2, tmp3;
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for(unsigned int number = 0; number < sse_iters; number++) for(unsigned int number = 0; number < sse_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in_common += 2; _in_common += 2;
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four rotated in_common samples in the register b // store four rotated in_common samples in the register b
b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic
//next two samples //next two samples
_in_common += 2; _in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr); real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); imag = _mm_adds_epi16(imag1, imag2);
realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real);
imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag); imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
            }
        }

        for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
                imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
                results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);

                _mm_store_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
                dotProduct = lv_cmake(0,0);
                for (int i = 0; i < 4; ++i)
                    {
                        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
                                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
                    }
                _out[n_vec] = dotProduct;
            }
        free(realcacc);
        free(imagcacc);

        _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
        (*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]);

        for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                for(unsigned int n = sse_iters * 4; n < num_points; n++)
                    {
                        tmp16 = *in_common++;
                        tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
                        tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
                        (*phase) *= phase_inc;
                        lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
                        _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
                                sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
                    }
            }
}
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -239,160 +246,331 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
\brief Multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector \brief Rotates and multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector
\param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored \param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored
\param[in] in_common Pointer to one of the vectors to be multiplied and accumulated (reference vector) \param[in] in_common Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector)
\param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated \param[in] phase_inc Phase increment = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad))
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated \param[in,out] phase Initial / final phase
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result \param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/ */
static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a; const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
//todo dyn mem reg //todo dyn mem reg
__m128i* realcacc; __m128i* realcacc;
__m128i* imagcacc; __m128i* imagcacc;
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0 imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, results;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2]; __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2]; __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
__m128 yl, yh, tmp1, tmp2, tmp3; __m128 yl, yh, tmp1, tmp2, tmp3;
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for(unsigned int number = 0; number < sse_iters; number++) for(unsigned int number = 0; number < sse_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in_common += 2; _in_common += 2;
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four rotated in_common samples in the register b // store four rotated in_common samples in the register b
b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic
//next two samples //next two samples
_in_common += 2; _in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr); real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); imag = _mm_adds_epi16(imag1, imag2);
realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag); imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
            }
        }

        for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real);
                imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag);
                results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);

                _mm_storeu_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
                dotProduct = lv_cmake(0,0);
                for (int i = 0; i < 4; ++i)
                    {
                        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
                                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
                    }
                _out[n_vec] = dotProduct;
            }
        free(realcacc);
        free(imagcacc);

        _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
        (*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]);

        for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                for(unsigned int n = sse_iters * 4; n < num_points; n++)
                    {
                        tmp16 = *in_common++;
                        tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
                        tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
                        (*phase) *= phase_inc;
                        lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
                        _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
                                sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
                    }
            }
}
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/*!
\brief Rotates and multiplies the reference complex vector with multiple versions of another complex vector, accumulates the results and stores them in the output vector
\param[out] result Array of num_a_vectors components with the multiple versions of in_a multiplied and accumulated The vector where the accumulated result will be stored
\param[in] in_common Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector)
\param[in] phase_inc Phase increment = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad))
\param[in,out] phase Initial / final phase
\param[in] in_a Pointer to an array of pointers to multiple versions of the other vector to be multiplied and accumulated
\param[in] num_a_vectors Number of vectors to be multiplied by the reference vector and accumulated
\param[in] num_points The Number of complex values to be multiplied together, accumulated and stored into result
*/
static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
const unsigned int neon_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
lv_16sc_t tmp16_, tmp;
lv_32fc_t tmp32_;
if (neon_iters > 0)
{
lv_16sc_t dotProduct = lv_cmake(0,0);
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc;
lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
int16x4x2_t a_val, b_val, c_val;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
float32x4_t half = vdupq_n_f32(0.5f);
int16x4x2_t tmp16;
int32x4x2_t tmp32i;
float32x4x2_t tmp32f, tmp32_real, tmp32_imag;
float32x4_t sign, PlusHalf, Round;
int16x4x2_t* accumulator;
accumulator = (int16x4x2_t*)calloc(num_a_vectors, sizeof(int16x4x2_t));
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
{
/* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in_common);
__builtin_prefetch(_in_common + 8);
_in_common += 4;
/* promote them to int 32 bits */
tmp32i.val[0] = vmovl_s16(tmp16.val[0]);
tmp32i.val[1] = vmovl_s16(tmp16.val[1]);
/* promote them to float 32 bits */
tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]);
tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]);
/* complex multiplication of four complex samples (float 32 bits each component) */
tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real);
tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag);
tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag);
tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real);
tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]);
tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]);
/* downcast results to int32 */
/* in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); */
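/* round to nearest, ties away from zero: add 0.5 to positive values, subtract 0.5 from negative ones, then let the conversion truncate */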
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31)));
PlusHalf = vaddq_f32(tmp32f.val[0], half);
Round = vsubq_f32(PlusHalf, sign);
tmp32i.val[0] = vcvtq_s32_f32(Round);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31)));
PlusHalf = vaddq_f32(tmp32f.val[1], half);
Round = vsubq_f32(PlusHalf, sign);
tmp32i.val[1] = vcvtq_s32_f32(Round);
/* downcast results to int16 */
tmp16.val[0] = vqmovn_s32(tmp32i.val[0]);
tmp16.val[1] = vqmovn_s32(tmp32i.val[1]);
/* compute next four phases */
tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real);
tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag);
tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag);
tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real);
_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]);
_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); // load 4 complex numbers (2-byte real, 2-byte imag each), deinterleaved into two 64-bit lanes
__builtin_prefetch(&_in_a[n_vec][number*4] + 8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]);
// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]);
c_val.val[0] = vsub_s16(b_val.val[0], b_val.val[1]);
// Multiply cross terms to get the imaginary result
// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]);
// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]);
c_val.val[1] = vadd_s16(b_val.val[0], b_val.val[1]);
accumulator[n_vec].val[0] = vadd_s16(accumulator[n_vec].val[0], c_val.val[0]);
accumulator[n_vec].val[1] = vadd_s16(accumulator[n_vec].val[1], c_val.val[1]);
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
}
_out[n_vec] = dotProduct;
}
free(accumulator);
vst1q_f32((float32_t*)__phase_real, _phase_real);
vst1q_f32((float32_t*)__phase_imag, _phase_imag);
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for (unsigned int n = neon_iters * 4; n < num_points; n++)
{
tmp16_ = in_common[n];
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp = tmp16_ * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
#endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/
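For orientation only, here is a minimal sketch of how a caller might drive one of the protokernels defined above. The function name and argument order are taken from this header; the include path, buffer names and phase angles are assumptions for the example, mirroring the puppet wrappers in the next file.
~~~~~~
/* Illustrative caller of the generic protokernel (names and angles are made up). */
#include <math.h>
/* assumed install location of this kernel header */
#include <volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h>

void correlate_three_replicas(const lv_16sc_t* signal, const lv_16sc_t** replicas, unsigned int num_points)
{
    lv_16sc_t result[3];
    /* phase and phase_inc must be unit-magnitude complex exponentials */
    lv_32fc_t phase = lv_cmake(cos(0.345), -sin(0.345));
    const lv_32fc_t phase_inc = lv_cmake(cos(0.123), -sin(0.123));
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, signal, phase_inc, &phase, replicas, 3, num_points);
    /* result[k] now holds the dot product of the rotated signal against replicas[k];
       phase has been advanced by num_points increments. */
}
~~~~~~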

View File

@ -55,16 +55,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
@ -84,16 +84,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
@ -114,21 +115,53 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy(in_a[n], in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // SSE3
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
float phase_step_rad = 0.123;
lv_32fc_t phase[1];
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H

View File

@ -87,7 +87,7 @@ endif(NOT DEFINED _XOPEN_SOURCE)
########################################################################
# detect x86 flavor of CPU
########################################################################
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86|x86|x86_64|amd64|AMD64)$")
message(STATUS "x86* CPU detected")
set(CPU_IS_x86 TRUE)
endif()
@ -106,19 +106,24 @@ macro(check_arch arch_name)
set(flags ${ARGN})
set(have_${arch_name} TRUE)
foreach(flag ${flags})
if ( (${COMPILER_NAME} STREQUAL "MSVC") AND (${flag} STREQUAL "/arch:SSE2" OR ${flag} STREQUAL "/arch:SSE" ))
# SSE/SSE2 is supported in MSVC since VS 2005 but flag not available when compiling 64-bit so do not check
else()
include(CheckCXXCompilerFlag)
set(have_flag have${flag})
#make the have_flag have nice alphanum chars (just for looks/not necessary)
execute_process(
COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))"
OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(VOLK_FLAG_CHECK_FLAGS)
set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
endif()
CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
unset(CMAKE_REQUIRED_FLAGS)
if (NOT ${have_flag})
set(have_${arch_name} FALSE)
endif()
endif()
endforeach()
if (have_${arch_name})
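For context, check_arch records whether the compiler accepts every flag associated with an architecture by defining have_<arch_name>. A hypothetical invocation (the architecture name and flag below are illustrative, not taken from this commit) would look like:
~~~~~~
# Probe GCC/Clang-style SSE3 support; MSVC's /arch:SSE* flags are skipped by the guard above
check_arch(sse3 -msse3)
if(have_sse3)
    message(STATUS "SSE3 protokernels can be built")
endif()
~~~~~~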