1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-06-24 22:13:15 +00:00

introducing new kernels

This commit is contained in:
Carles Fernandez 2016-01-12 22:48:59 +01:00
parent 1bf645c98c
commit eb1dbfe37b
9 changed files with 409 additions and 405 deletions

View File

@ -0,0 +1,170 @@
/*!
* \file volk_gnsssdr_16ic_resampler_16ic.h
* \brief Volk protokernel: resample a 16 bits complex vector
* \authors <ul>
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
* and 8 bits the imaginary part) and accumulates them
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
#define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
#include <math.h>
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <volk_gnsssdr/saturated_arithmetic.h>
//#pragma STDC FENV_ACCESS ON
#ifdef LV_HAVE_GENERIC
//int round_int( float r ) {
// return (r > 0.0) ? (r + 0.5) : (r - 0.5);
//}
/*!
\brief Multiplies the two input complex vectors, point-by-point, storing the result in the third vector
\param cVector The vector where the result will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/
static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)
{
int local_code_chip_index;
//fesetround(FE_TONEAREST);
for (unsigned int n = 0; n < num_output_samples; n++)
{
// resample code for current tap
local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);
if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips;
//std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" ";
result[n] = local_code[local_code_chip_index];
}
//std::cout<<std::endl;
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)//, int* scratch_buffer, float* scratch_buffer_float)
{
_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t* _result = result;
__attribute__((aligned(16))) int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips-1;
four_times_code_length_chips_minus1[1] = code_length_chips-1;
four_times_code_length_chips_minus1[2] = code_length_chips-1;
four_times_code_length_chips_minus1[3] = code_length_chips-1;
__attribute__((aligned(16))) int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__m128 _4output_index = _mm_load_ps(init_idx_float);
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
//__attribute__((aligned(16))) int output_indexes[4];
for(number = 0; number < quarterPoints; number++)
{
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
//_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int));
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset);
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
*_result++ = local_code[local_code_chip_index[1]];
*_result++ = local_code[local_code_chip_index[2]];
*_result++ = local_code[local_code_chip_index[3]];
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
//_scratch_buffer_float+=4;
}
for(number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
if (local_code_chip_index[0] > (code_length_chips - 1)) local_code_chip_index[0] -= code_length_chips;
*_result++ = local_code[local_code_chip_index[0]];
//*_scratch_buffer_float++=code_phase_step_chips*static_cast<float>(number)+rem_code_phase_chips;
}
// for(unsigned int n=0;n<num_output_samples;n++)
// {
//
// std::cout<<"s["<<n<<"]="<<scratch_buffer_float[n]<<","<<scratch_buffer[n]<<" ";
// }
// std::cout<<std::endl;
}
#endif /* LV_HAVE_SSE2 */
#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H*/

View File

@ -0,0 +1,152 @@
/*!
* \file volk_gnsssdr_16ic_x2_dot_prod_16ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
* \authors <ul>
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
* and 8 bits the imaginary part) and accumulates them
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
#define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <volk_gnsssdr/saturated_arithmetic.h>
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector
\param cVector The vector where the accumulated result will be stored
\param aVector One of the vectors to be multiplied and accumulated
\param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
result[0] = lv_cmake((int16_t)0, (int16_t)0);
for (unsigned int n = 0; n < num_points; n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
//result[0]+=in_a[n]*in_b[n];
lv_16sc_t tmp = in_a[n] * in_b[n];
result[0] = lv_cmake(sat_adds16b(lv_creal(result[0]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[0]), lv_cimag(tmp) ));
//result[0].real(sat_adds16b(result[0].real(),lv_creal(tmp)));
//result[0].imag(sat_adds16b(result[0].imag(),tmp.imag()));
}
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
const unsigned int sse_iters = num_points / 4;
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
if (sse_iters > 0)
{
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result;
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b = _mm_loadu_si128((__m128i*)_in_b);
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c,c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic!
realcacc = _mm_adds_epi16 (realcacc, real);
imagcacc = _mm_adds_epi16 (imagcacc, imag);
_in_a += 4;
_in_b += 4;
}
realcacc = _mm_and_si128 (realcacc, mask_real);
imagcacc = _mm_and_si128 (imagcacc, mask_imag);
result = _mm_or_si128 (realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
for (int i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
//dotProduct.real(sat_adds16b(lv_creal(dotProduct),lv_creal(dotProductVector[i])));
//dotProduct.imag(sat_adds16b(lv_cimag(dotProduct),lv_cimag(dotProductVector[i])));
}
}
for (unsigned int i = 0; i < (num_points % 4); ++i)
{
//dotProduct += (*_in_a++) * (*_in_b++);
lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
dotProduct = lv_cmake( sat_adds16b(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16b(lv_cimag(dotProduct), lv_cimag(tmp)));
//dotProduct.real(sat_adds16b(lv_creal(dotProduct),lv_creal(tmp)));
//dotProduct.imag(sat_adds16b(lv_cimag(dotProduct),lv_cimag(tmp)));
}
*_out = dotProduct;
}
#endif /* LV_HAVE_SSE2 */
#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/

View File

@ -79,6 +79,8 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
(VOLK_INIT_TEST(volk_gnsssdr_64f_accumulator_64f, test_params))
(VOLK_INIT_TEST(volk_gnsssdr_32fc_convert_8ic, test_params))
(VOLK_INIT_TEST(volk_gnsssdr_32fc_convert_16ic, test_params))
(VOLK_INIT_TEST(volk_gnsssdr_16ic_x2_dot_prod_16ic, test_params))
(VOLK_INIT_TEST(volk_gnsssdr_16ic_resampler_16ic, test_params))
;
return test_cases;

View File

@ -170,6 +170,14 @@ typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
//typedef void (*volk_gnsssdr_fn_1arg_s16i)(void *, int16_t, unsigned int, const char*); //one input vector, one scalar int16_t input
//typedef void (*volk_gnsssdr_fn_2arg_s16i)(void *, void *, int16_t, unsigned int, const char*);
//typedef void (*volk_gnsssdr_fn_3arg_s16i)(void *, void *, void *, int16_t, unsigned int, const char*);
//typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input
//typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*);
//typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_6arg_s16ic)(void *, void *, void *, void *, void *, void *, lv_16sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);

View File

@ -36,11 +36,14 @@
#include <iostream>
#include <gnuradio/fxpt.h> // fixed point sine and cosine
#include "volk_gnsssdr/volk_gnsssdr.h"
#define LV_HAVE_GENERIC
#define LV_HAVE_SSE2
#include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h"
//#include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h"
#include "volk_gnsssdr_16ic_x2_multiply_16ic.h"
#include "volk_gnsssdr_16ic_resampler_16ic.h"
//#include "volk_gnsssdr_16ic_resampler_16ic.h"
#include "volk_gnsssdr_16ic_xn_resampler_16ic_xn.h"
#include "volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h"

View File

@ -1,171 +0,0 @@
/*!
* \file volk_gnsssdr_16ic_resampler_16ic.h
* \brief Volk protokernel: resample a 16 bits complex vector
* \authors <ul>
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
* and 8 bits the imaginary part) and accumulates them
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
#define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <cmath>
//#pragma STDC FENV_ACCESS ON
#ifdef LV_HAVE_GENERIC
//int round_int( float r ) {
// return (r > 0.0) ? (r + 0.5) : (r - 0.5);
//}
/*!
\brief Multiplies the two input complex vectors, point-by-point, storing the result in the third vector
\param cVector The vector where the result will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/
static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)
{
int local_code_chip_index;
//fesetround(FE_TONEAREST);
for (unsigned int n = 0; n < num_output_samples; n++)
{
// resample code for current tap
local_code_chip_index = round(code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f);
if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips;
//std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" ";
result[n] = local_code[local_code_chip_index];
}
//std::cout<<std::endl;
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, int code_length_chips)//, int* scratch_buffer, float* scratch_buffer_float)
{
_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t* _result = result;
__attribute__((aligned(16))) int local_code_chip_index[4];
__m128 _rem_code_phase,_code_phase_step_chips;
__m128i _code_length_chips,_code_length_chips_minus1;
__m128 _code_phase_out,_code_phase_out_with_offset;
rem_code_phase_chips=rem_code_phase_chips-0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0]=code_length_chips-1;
four_times_code_length_chips_minus1[1]=code_length_chips-1;
four_times_code_length_chips_minus1[2]=code_length_chips-1;
four_times_code_length_chips_minus1[3]=code_length_chips-1;
__attribute__((aligned(16))) int four_times_code_length_chips[4];
four_times_code_length_chips[0]=code_length_chips;
four_times_code_length_chips[1]=code_length_chips;
four_times_code_length_chips[2]=code_length_chips;
four_times_code_length_chips[3]=code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
__m128i zero=_mm_setzero_si128();
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__m128 _4output_index=_mm_load_ps(init_idx_float);
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__m128 _4constant_float=_mm_load_ps(init_4constant_float);
//__attribute__((aligned(16))) int output_indexes[4];
for(number=0;number < quarterPoints; number++){
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out,_rem_code_phase); //add the phase offset
_code_phase_out_int=_mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes=_mm_cmplt_epi32 (_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg=_mm_add_epi32(_code_phase_out_int,_code_length_chips); //the negative values branch
//_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int));
_code_phase_out_int_neg=_mm_xor_si128(_code_phase_out_int,_mm_and_si128( negative_indexes,_mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
overflow_indexes=_mm_cmpgt_epi32 (_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over=_mm_sub_epi32(_code_phase_out_int_neg,_code_length_chips); //the negative values branch
_code_phase_out_int_over=_mm_xor_si128(_code_phase_out_int_neg,_mm_and_si128( overflow_indexes,_mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
_mm_storeu_si128((__m128i*)local_code_chip_index,_code_phase_out_int_over); // Store the results back
//_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset);
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++=local_code[local_code_chip_index[0]];
*_result++=local_code[local_code_chip_index[1]];
*_result++=local_code[local_code_chip_index[2]];
*_result++=local_code[local_code_chip_index[3]];
_4output_index = _mm_add_ps(_4output_index,_4constant_float);
//_scratch_buffer_float+=4;
}
for(number = quarterPoints * 4;number < num_output_samples; number++){
local_code_chip_index[0]=static_cast<int>(code_phase_step_chips*static_cast<float>(number) + rem_code_phase_chips+0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips-1;
if (local_code_chip_index[0] > (code_length_chips-1)) local_code_chip_index[0] -= code_length_chips;
*_result++=local_code[local_code_chip_index[0]];
//*_scratch_buffer_float++=code_phase_step_chips*static_cast<float>(number)+rem_code_phase_chips;
}
// for(unsigned int n=0;n<num_output_samples;n++)
// {
//
// std::cout<<"s["<<n<<"]="<<scratch_buffer_float[n]<<","<<scratch_buffer[n]<<" ";
// }
// std::cout<<std::endl;
}
#endif /* LV_HAVE_SSE2 */
#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H*/

View File

@ -1,152 +0,0 @@
/*!
* \file volk_gnsssdr_16ic_x2_dot_prod_16ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
* \authors <ul>
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
* and 8 bits the imaginary part) and accumulates them
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
#define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <stdio.h>
#include <string.h>
#include "saturated_arithmetic.h"
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector
\param cVector The vector where the accumulated result will be stored
\param aVector One of the vectors to be multiplied and accumulated
\param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
result[0]=lv_16sc_t(0,0);
for (unsigned int n=0;n<num_points;n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
//result[0]+=in_a[n]*in_b[n];
lv_16sc_t tmp=in_a[n]*in_b[n];
result[0].real(sat_adds16b(result[0].real(),tmp.real()));
result[0].imag(sat_adds16b(result[0].imag(),tmp.imag()));
}
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
lv_16sc_t dotProduct=lv_16sc_t(0,0);
const unsigned int sse_iters = num_points / 4;
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
if (sse_iters>0)
{
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result;
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0;number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b = _mm_loadu_si128((__m128i*)_in_b);
c=_mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c,c_sr);
b_sl = _mm_slli_si128(b,2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a,2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a,b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b,a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1,imag2); //with saturation aritmetic!
realcacc = _mm_adds_epi16 (realcacc, real);
imagcacc = _mm_adds_epi16 (imagcacc, imag);
_in_a += 4;
_in_b += 4;
}
realcacc = _mm_and_si128 (realcacc, mask_real);
imagcacc = _mm_and_si128 (imagcacc, mask_imag);
result = _mm_or_si128 (realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
dotProduct.real(sat_adds16b(dotProduct.real(),dotProductVector[i].real()));
dotProduct.imag(sat_adds16b(dotProduct.imag(),dotProductVector[i].imag()));
}
}
for (unsigned int i = 0; i<(num_points % 4); ++i)
{
//dotProduct += (*_in_a++) * (*_in_b++);
lv_16sc_t tmp=(*_in_a++) * (*_in_b++);
dotProduct.real(sat_adds16b(dotProduct.real(),tmp.real()));
dotProduct.imag(sat_adds16b(dotProduct.imag(),tmp.imag()));
}
*_out = dotProduct;
}
#endif /* LV_HAVE_SSE2 */
#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/

View File

@ -38,7 +38,7 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include "saturated_arithmetic.h"
#include <volk_gnsssdr/saturated_arithmetic.h>
#ifdef LV_HAVE_GENERIC
/*!
@ -50,18 +50,17 @@
*/
static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors)
{
for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
{
result[n_vec]=lv_cmake(0,0);
for (unsigned int n=0;n<num_points;n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
//result[n_vec]+=in_common[n]*in_a[n_vec][n];
lv_16sc_t tmp=in_common[n]*in_a[n_vec][n];
result[n_vec]=lv_cmake(sat_adds16b(lv_creal(result[n_vec]),lv_creal(tmp)),sat_adds16b(lv_cimag(result[n_vec]),lv_cimag(tmp)));
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
for (unsigned int n = 0; n < num_points; n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
//result[n_vec]+=in_common[n]*in_a[n_vec][n];
lv_16sc_t tmp = in_common[n]*in_a[n_vec][n];
result[n_vec] = lv_cmake(sat_adds16b(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[n_vec]), lv_cimag(tmp)));
}
}
}
#endif /*LV_HAVE_GENERIC*/
@ -71,99 +70,92 @@ static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* resu
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors)
{
lv_16sc_t dotProduct=lv_cmake(0,0);
lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int sse_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = out;
if (sse_iters>0)
if (sse_iters > 0)
{
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
//todo dyn mem reg
//todo dyn mem reg
__m128i* realcacc;
__m128i* imagcacc;
__m128i* realcacc;
__m128i* imagcacc;
realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
for(unsigned int number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
for(unsigned int number = 0;number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
{
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
c=_mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c,c_sr);
imag = _mm_adds_epi16(imag1, imag2);
b_sl = _mm_slli_si128(b,2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a,2); // a3.r, a2.i ....
realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real);
imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
imag1 = _mm_mullo_epi16(a,b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b,a_sl); // b3.i*a3.r, ....
}
_in_common += 4;
}
imag = _mm_adds_epi16(imag1,imag2);
for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
{
realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag);
realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real);
imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]);
}
_in_common += 4;
}
for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
{
realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag);
result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]);
_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
dotProduct=lv_cmake(0,0);
for (int i = 0; i<4; ++i)
{
dotProduct=lv_cmake(sat_adds16b(lv_creal(dotProduct),lv_creal(dotProductVector[i])),
sat_adds16b(lv_cimag(dotProduct),lv_cimag(dotProductVector[i])));
}
_out[n_vec]=dotProduct;
}
free(realcacc);
free(imagcacc);
_mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i<4; ++i)
{
dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
}
_out[n_vec] = dotProduct;
}
free(realcacc);
free(imagcacc);
}
for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
{
for(unsigned int n = sse_iters * 4;n < num_points; n++){
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = sse_iters * 4;n < num_points; n++){
lv_16sc_t tmp=in_common[n]*in_a[n_vec][n];
lv_16sc_t tmp = in_common[n]*in_a[n_vec][n];
_out[n_vec]=lv_cmake(sat_adds16b(lv_creal(_out[n_vec]),lv_creal(tmp)),
sat_adds16b(lv_cimag(_out[n_vec]),lv_cimag(tmp)));
}
}
_out[n_vec] = lv_cmake(sat_adds16b(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16b(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
#endif /* LV_HAVE_SSE2 */