Fixing missing phase increment in SIMD implementations

After computing the rotation with SIMD instructions, we were not
advancing the phase accumulator, so the first iteration in the plain C
tail region had the same phase as the last sample computed with SIMD
instructions. This commit fixes the bug in the SSE3 and NEON
implementations.
Carles Fernandez, 2016-01-29 19:42:30 +01:00
commit db321d1c2e
2 changed files with 132 additions and 150 deletions
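The bug in miniature: the kernel rotates samples four at a time in SIMD registers and then finishes the remainder in plain C, and the hand-off between the two regions is where the phase went stale. A minimal sketch of the pattern, with illustrative names rather than the kernel's own signatures:

    #include <complex.h>

    /* Sketch of the hand-off bug. reg_phase stands for the phase tracked in
       SIMD registers; *phase is the accumulator shared with the caller. */
    void rotator_sketch(float complex* out, const float complex* in,
                        float complex phase_inc, float complex* phase,
                        unsigned int num_points)
    {
        const unsigned int simd_iters = num_points / 4;
        float complex reg_phase = *phase;
        for (unsigned int n = 0; n < simd_iters * 4; n++) /* stands in for the SIMD body */
            {
                out[n] = in[n] * reg_phase;
                reg_phase *= phase_inc;
            }
        (*phase) = reg_phase; /* THE FIX: without this write-back, the first
                                 tail sample below reuses a stale phase */
        for (unsigned int i = simd_iters * 4; i < num_points; ++i)
            {
                out[i] = in[i] * (*phase);
                (*phase) *= phase_inc;
            }
    }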

volk_gnsssdr_16ic_rotatorpuppet_16ic.h

@@ -24,9 +24,9 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_generic(lv_16sc_t* outVe
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@@ -35,14 +35,14 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse2(lv_16sc_t* outVec
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(outVector, inVector, phase_inc[0], phase, num_points);
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(outVector, inVector, phase_inc[0], phase, num_points);
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE2
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@@ -51,10 +51,10 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse2(lv_16sc_t* outVec
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(outVector, inVector, phase_inc[0], phase, num_points);
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(outVector, inVector, phase_inc[0], phase, num_points);
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
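As the "phases must be normalized" comments above stress, the rotator takes its initial phase and per-sample step as unit-magnitude complex exponentials, so rotating a sample is a single complex multiply. Condensing what the two puppet functions do (the helper name is illustrative):

    #include <math.h>
    #include <complex.h>

    /* Build the unit-magnitude phasor e^{-j*angle} that the rotator expects;
       the negative sine matches the lv_cmake(cos(x), -sin(x)) calls above. */
    static float complex make_phasor(float angle_rad)
    {
        return cosf(angle_rad) - sinf(angle_rad) * I;
    }

    /* usage, mirroring the puppet code:
       float complex phase     = make_phasor(rem_carrier_phase_in_rad);
       float complex phase_inc = make_phasor(phase_step_rad);           */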

volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h

@@ -38,6 +38,7 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <math.h>
#include <stdio.h>
#define ROTATOR_RELOAD 512
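ROTATOR_RELOAD is the usual guard against amplitude drift: after thousands of repeated complex multiplies, the phase's magnitude wanders away from 1.0 in float arithmetic, so rotator kernels renormalize it back to the unit circle at a fixed cadence. A hedged sketch of that pattern, not this kernel's exact loop:

    #include <math.h>
    #include <complex.h>

    #define ROTATOR_RELOAD 512

    /* Advance the phase n_samples steps, snapping it back to unit magnitude
       every ROTATOR_RELOAD samples so float rounding cannot scale the output. */
    static void advance_phase(float complex* phase, float complex phase_inc,
                              unsigned int n_samples)
    {
        for (unsigned int n = 0; n < n_samples; n++)
            {
                (*phase) *= phase_inc;
                if ((n % ROTATOR_RELOAD) == ROTATOR_RELOAD - 1)
                    {
                        (*phase) /= hypotf(crealf(*phase), cimagf(*phase));
                    }
            }
    }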
@@ -72,198 +73,179 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
__m128 a,b, two_phase_acc_reg,two_phase_inc_reg;
__m128i c1,c2,result;
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc*phase_inc;
two_phase_inc[1] = phase_inc*phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
const lv_16sc_t* _in = inVector;
const lv_16sc_t* _in_a = inVector;
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
// debug
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
//const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = outVector;
__m128 fc_reg1, fc_reg2;
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
__m128 yl, yh, tmp1, tmp2, tmp3;
for(unsigned int number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//b = _mm_loadu_si128((__m128i*)_in_b);
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// compute next four 16ic complex exponential values for phase rotation
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// compute next four float complex rotations
four_phase_rotations_32fc[0]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[1]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[2]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[3]=*phase;
(*phase) *= phase_inc;
//convert the rotations to integers
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
b = _mm_packs_epi32(sc_reg1, sc_reg2);
// debug
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
// multiply the input vector times the rotations
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2);
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm_or_si128 (real, imag);
// normalize the rotations
// TODO
// store results
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result);
_in_a += 4;
//next two samples
_in += 2;
_out += 4;
}
_mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]) * phase_inc;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
{
*_out++ = *_in_a++ * (*phase);
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
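The moveldup/movehdup/addsub sequence that recurs above is the standard SSE3 idiom for multiplying packed complex floats: duplicate the real parts of one operand, duplicate its imaginary parts, form the two partial products, and let _mm_addsub_ps apply the (ac - bd, ad + bc) sign pattern in one instruction. Isolated as a self-contained helper (a sketch, not part of the kernel's API):

    #include <pmmintrin.h> /* SSE3 */

    /* Multiply two registers of packed complex floats, layout [re0, im0, re1, im1];
       the same instruction sequence the kernel inlines above. */
    static inline __m128 complex_mul_sse3(__m128 x, __m128 y)
    {
        __m128 yl = _mm_moveldup_ps(y);         /* yr0, yr0, yr1, yr1 */
        __m128 yh = _mm_movehdup_ps(y);         /* yi0, yi0, yi1, yi1 */
        __m128 t1 = _mm_mul_ps(x, yl);          /* xr*yr, xi*yr, ...  */
        __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* swap re/im within each pair */
        __m128 t2 = _mm_mul_ps(xs, yh);         /* xi*yi, xr*yi, ...  */
        return _mm_addsub_ps(t1, t2);           /* xr*yr - xi*yi, xi*yr + xr*yi, ... */
    }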
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
__m128 a,b, two_phase_acc_reg,two_phase_inc_reg;
__m128i c1,c2,result;
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc*phase_inc;
two_phase_inc[1] = phase_inc*phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase)*phase_inc;
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
const lv_16sc_t* _in = inVector;
const lv_16sc_t* _in_a = inVector;
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
// debug
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
//const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = outVector;
__m128 fc_reg1, fc_reg2;
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
__m128 yl, yh, tmp1, tmp2, tmp3;
for(unsigned int number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//b = _mm_loadu_si128((__m128i*)_in_b);
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// compute next four 16ic complex exponential values for phase rotation
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// compute next four float complex rotations
four_phase_rotations_32fc[0]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[1]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[2]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[3]=*phase;
(*phase) *= phase_inc;
//convert the rotations to integers
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
b = _mm_packs_epi32(sc_reg1, sc_reg2);
// debug
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
// multiply the input vector times the rotations
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2);
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm_or_si128 (real, imag);
// normalize the rotations
// TODO
// store results
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result);
_in_a += 4;
//next two samples
_in += 2;
_out += 4;
}
_mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]) * phase_inc;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
{
*_out++ = *_in_a++ * (*phase);
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
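Both SSE3 paths return to 16-bit samples through the same two-step narrowing used above: _mm_cvtps_epi32 rounds each float lane to a 32-bit integer, then _mm_packs_epi32 squeezes two such registers into eight int16 lanes with saturation. As a stand-alone helper (illustrative name):

    #include <emmintrin.h> /* SSE2 */

    /* Round-and-saturate eight rotated floats (four complex samples) down to
       int16, matching the c1/c2 -> result step in the loops above. */
    static inline __m128i pack_floats_to_16ic(__m128 lo, __m128 hi)
    {
        __m128i i32_lo = _mm_cvtps_epi32(lo);   /* round to nearest 32-bit int */
        __m128i i32_hi = _mm_cvtps_epi32(hi);
        return _mm_packs_epi32(i32_lo, i32_hi); /* narrow to int16, saturating */
    }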
#ifdef LV_HAVE_NEON
#include <arm_neon.h>