Fixing missing phase increment in SIMD implementations

After computing the rotation with SIMD instructions, we were not
advancing the phase accumulator, so the first iteration in the plain C
tail region had the same phase as the last sample computed with SIMD
instructions. This commit fixes the bug in the SSE3 and NEON
implementations.
Carles Fernandez, 2016-01-29 19:42:30 +01:00
commit db321d1c2e
2 changed files with 132 additions and 150 deletions
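The bug in miniature: the kernel rotates samples four at a time in SIMD registers and then finishes the remainder in plain C, and the hand-off between the two regions is where the phase went stale. A minimal sketch of the pattern, with illustrative names rather than the kernel's own signatures:

    #include <complex.h>

    /* Sketch of the hand-off bug. reg_phase stands for the phase tracked in
       SIMD registers; *phase is the accumulator shared with the caller. */
    void rotator_sketch(float complex* out, const float complex* in,
                        float complex phase_inc, float complex* phase,
                        unsigned int num_points)
    {
        const unsigned int simd_iters = num_points / 4;
        float complex reg_phase = *phase;
        for (unsigned int n = 0; n < simd_iters * 4; n++) /* stands in for the SIMD body */
            {
                out[n] = in[n] * reg_phase;
                reg_phase *= phase_inc;
            }
        (*phase) = reg_phase; /* THE FIX: without this write-back, the first
                                 tail sample below reuses a stale phase */
        for (unsigned int i = simd_iters * 4; i < num_points; ++i)
            {
                out[i] = in[i] * (*phase);
                (*phase) *= phase_inc;
            }
    }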

volk_gnsssdr_16ic_rotatorpuppet_16ic.h

@@ -24,9 +24,9 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_generic(lv_16sc_t* outVe
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@@ -35,14 +35,14 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse2(lv_16sc_t* outVec
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(outVector, inVector, phase_inc[0], phase, num_points);
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(outVector, inVector, phase_inc[0], phase, num_points);
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE2
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@@ -51,10 +51,10 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse2(lv_16sc_t* outVec
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(outVector, inVector, phase_inc[0], phase, num_points);
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(outVector, inVector, phase_inc[0], phase, num_points);
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
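As the "phases must be normalized" comments above stress, the rotator takes its initial phase and per-sample step as unit-magnitude complex exponentials, so rotating a sample is a single complex multiply. Condensing what the two puppet functions do (the helper name is illustrative):

    #include <math.h>
    #include <complex.h>

    /* Build the unit-magnitude phasor e^{-j*angle} that the rotator expects;
       the negative sine matches the lv_cmake(cos(x), -sin(x)) calls above. */
    static float complex make_phasor(float angle_rad)
    {
        return cosf(angle_rad) - sinf(angle_rad) * I;
    }

    /* usage, mirroring the puppet code:
       float complex phase     = make_phasor(rem_carrier_phase_in_rad);
       float complex phase_inc = make_phasor(phase_step_rad);           */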

volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h

@@ -38,6 +38,7 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <math.h>
#include <stdio.h>
#define ROTATOR_RELOAD 512
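ROTATOR_RELOAD is the usual guard against amplitude drift: after thousands of repeated complex multiplies, the phase's magnitude wanders away from 1.0 in float arithmetic, so rotator kernels renormalize it back to the unit circle at a fixed cadence. A hedged sketch of that pattern, not this kernel's exact loop:

    #include <math.h>
    #include <complex.h>

    #define ROTATOR_RELOAD 512

    /* Advance the phase n_samples steps, snapping it back to unit magnitude
       every ROTATOR_RELOAD samples so float rounding cannot scale the output. */
    static void advance_phase(float complex* phase, float complex phase_inc,
                              unsigned int n_samples)
    {
        for (unsigned int n = 0; n < n_samples; n++)
            {
                (*phase) *= phase_inc;
                if ((n % ROTATOR_RELOAD) == ROTATOR_RELOAD - 1)
                    {
                        (*phase) /= hypotf(crealf(*phase), cimagf(*phase));
                    }
            }
    }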
@@ -72,198 +73,179 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
__m128 a,b, two_phase_acc_reg,two_phase_inc_reg;
__m128i c1,c2,result;
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc*phase_inc;
two_phase_inc[1] = phase_inc*phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
const lv_16sc_t* _in = inVector;
const lv_16sc_t* _in_a = inVector;
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
// debug
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
//const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = outVector;
__m128 fc_reg1, fc_reg2;
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
__m128 yl, yh, tmp1, tmp2, tmp3;
for(unsigned int number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//b = _mm_loadu_si128((__m128i*)_in_b);
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// compute next four 16ic complex exponential values for phase rotation
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// compute next four float complex rotations
four_phase_rotations_32fc[0]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[1]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[2]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[3]=*phase;
(*phase) *= phase_inc;
//convert the rotations to integers
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
b = _mm_packs_epi32(sc_reg1, sc_reg2);
// debug
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
// multiply the input vector times the rotations
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2);
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm_or_si128 (real, imag);
// normalize the rotations
// TODO
// store results
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result);
_in_a += 4;
//next two samples
_in += 2;
_out += 4;
}
_mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]) * phase_inc;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
{
*_out++ = *_in_a++ * (*phase);
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
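The moveldup/movehdup/addsub sequence that recurs above is the standard SSE3 idiom for multiplying packed complex floats: duplicate the real parts of one operand, duplicate its imaginary parts, form the two partial products, and let _mm_addsub_ps apply the (ac - bd, ad + bc) sign pattern in one instruction. Isolated as a self-contained helper (a sketch, not part of the kernel's API):

    #include <pmmintrin.h> /* SSE3 */

    /* Multiply two registers of packed complex floats, layout [re0, im0, re1, im1];
       the same instruction sequence the kernel inlines above. */
    static inline __m128 complex_mul_sse3(__m128 x, __m128 y)
    {
        __m128 yl = _mm_moveldup_ps(y);         /* yr0, yr0, yr1, yr1 */
        __m128 yh = _mm_movehdup_ps(y);         /* yi0, yi0, yi1, yi1 */
        __m128 t1 = _mm_mul_ps(x, yl);          /* xr*yr, xi*yr, ...  */
        __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* swap re/im within each pair */
        __m128 t2 = _mm_mul_ps(xs, yh);         /* xi*yi, xr*yi, ...  */
        return _mm_addsub_ps(t1, t2);           /* xr*yr - xi*yi, xi*yr + xr*yi, ... */
    }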
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse2(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
__m128 a,b, two_phase_acc_reg,two_phase_inc_reg;
__m128i c1,c2,result;
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc*phase_inc;
two_phase_inc[1] = phase_inc*phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase)*phase_inc;
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
const lv_16sc_t* _in = inVector;
const lv_16sc_t* _in_a = inVector;
__attribute__((aligned(32))) lv_32fc_t four_phase_rotations_32fc[4];
// debug
//__attribute__((aligned(16))) lv_16sc_t four_phase_rotations_16sc[4];
// specify how many bits are used in the rotation (2^(N-1)) (it WILL increase the output signal range!)
__attribute__((aligned(32))) float rotator_amplitude_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__m128 _rotator_amplitude_reg = _mm_load_ps(rotator_amplitude_float);
//const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = outVector;
__m128 fc_reg1, fc_reg2;
__m128i sc_reg1, sc_reg2; // is __m128i defined in xmmintrin.h?
__m128 yl, yh, tmp1, tmp2, tmp3;
for(unsigned int number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//b = _mm_loadu_si128((__m128i*)_in_b);
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// compute next four 16ic complex exponential values for phase rotation
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// compute next four float complex rotations
four_phase_rotations_32fc[0]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[1]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[2]=*phase;
(*phase) *= phase_inc;
four_phase_rotations_32fc[3]=*phase;
(*phase) *= phase_inc;
//convert the rotations to integers
fc_reg1 = _mm_load_ps((float*)&four_phase_rotations_32fc[0]);
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128-bit reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b=_mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
// disable next line for 1 bit rotation (equivalent to a square wave NCO)
fc_reg1 = _mm_mul_ps (fc_reg1, _rotator_amplitude_reg);
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
fc_reg2 = _mm_load_ps((float*)&four_phase_rotations_32fc[2]);
sc_reg1 = _mm_cvtps_epi32(fc_reg1);
sc_reg2 = _mm_cvtps_epi32(fc_reg2);
b = _mm_packs_epi32(sc_reg1, sc_reg2);
// debug
//_mm_store_si128((__m128i*)four_phase_rotations_16sc, b);
//printf("phase fc: %f,%f phase sc: %i,%i \n",lv_creal(four_phase_rotations_32fc[0]),lv_cimag(four_phase_rotations_32fc[0]),lv_creal(four_phase_rotations_16sc[0]),lv_cimag(four_phase_rotations_16sc[0]));
// multiply the input vector times the rotations
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2);
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm_or_si128 (real, imag);
// normalize the rotations
// TODO
// store results
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result);
_in_a += 4;
//next two samples
_in += 2;
_out += 4;
}
_mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = lv_cmake(two_phase_acc[0], two_phase_acc[0]) * phase_inc;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
{
*_out++ = *_in_a++ * (*phase);
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE3 */
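Both SSE3 paths return to 16-bit samples through the same two-step narrowing used above: _mm_cvtps_epi32 rounds each float lane to a 32-bit integer, then _mm_packs_epi32 squeezes two such registers into eight int16 lanes with saturation. As a stand-alone helper (illustrative name):

    #include <emmintrin.h> /* SSE2 */

    /* Round-and-saturate eight rotated floats (four complex samples) down to
       int16, matching the c1/c2 -> result step in the loops above. */
    static inline __m128i pack_floats_to_16ic(__m128 lo, __m128 hi)
    {
        __m128i i32_lo = _mm_cvtps_epi32(lo);   /* round to nearest 32-bit int */
        __m128i i32_hi = _mm_cvtps_epi32(hi);
        return _mm_packs_epi32(i32_lo, i32_hi); /* narrow to int16, saturating */
    }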
#ifdef LV_HAVE_NEON
#include <arm_neon.h>