Optimize 16ic to 32fc conversion

Optimize conversion from 16-bit integer complex numbers to 32-bit floating-point complex numbers using RVV intrinsics by skipping any need to distinguish between real and imaginary numbers and instead just loading a contiguous selection of components. Timing results: Old - generic: 39736.8 ms, rvv: 10406.7 ms; New - generic: 40479.9 ms, rvv: 8245.38 ms. Signed-off-by: Marcus Alagar <mvala079@gmail.com>
2025-11-04 17:23:20 +00:00 · 2025-08-24 14:07:34 -05:00
parent 1b265d55be
commit f5999872f6
1 changed files with 14 additions and 19 deletions
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h
@@ -270,35 +270,30 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,

 static inline void volk_gnsssdr_16ic_convert_32fc_rvv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
 {
-    size_t n = num_points;
+    // Will be converting by number, with each
+    // complex number containing two component numbers
+    size_t n = num_points * 2;

    // Initialize pointers to keep track as stripmine
    float* outPtr = (float*) outputVector;
    const short* inPtr = (const short*) inputVector;

-    for (size_t vl; n > 0; n -= vl, outPtr += vl * 2, inPtr += vl * 2)
+    for (size_t vl; n > 0; n -= vl, outPtr += vl, inPtr += vl)
        {
            // Record how many elements will actually be processed
-            // Only use a EMUL of 2 so that when widen, EMUL = 4
-            // which still allows a segment store with NFIELDS = 2
-            vl = __riscv_vsetvl_e16m2(n);
+            // Only use a EMUL of 4 so that when widen, EMUL = 8
+            vl = __riscv_vsetvl_e16m4(n);

-            // Load inReal[0..vl), inImag[0..vl)
-            vint16m2x2_t inVal = __riscv_vlseg2e16_v_i16m2x2(inPtr, vl);
-            vint16m2_t inRealVal = __riscv_vget_v_i16m2x2_i16m2(inVal, 0);
-            vint16m2_t inImagVal = __riscv_vget_v_i16m2x2_i16m2(inVal, 1);
+            // Don't have to segment store/load since converting
+            // both real and imaginary components
+            // Load in[0..vl)
+            vint16m4_t inVal = __riscv_vle16_v_i16m4(inPtr, vl);

-            // outReal[i] = (float) inReal[i]
-            // outImag[i] = (float) inImag[i]
-            vfloat32m4_t outRealVal = __riscv_vfwcvt_f_x_v_f32m4(inRealVal, vl);
-            vfloat32m4_t outImagVal = __riscv_vfwcvt_f_x_v_f32m4(inImagVal, vl);
+            // out[i] = (float) in[i]
+            vfloat32m8_t outVal = __riscv_vfwcvt_f_x_v_f32m8(inVal, vl);

-            // Store outReal[0..vl), outImag[0..vl)
-            vfloat32m4x2_t outVal = __riscv_vset_v_f32m4_f32m4x2(
-                __riscv_vundefined_f32m4x2(), 0, outRealVal
-            );
-            outVal = __riscv_vset_v_f32m4_f32m4x2(outVal, 1, outImagVal);
-            __riscv_vsseg2e32_v_f32m4x2(outPtr, outVal, vl);
+            // Store out[0..vl)
+            __riscv_vse32_v_f32m8(outPtr, outVal, vl);

            // In looping, decrement the number
            // elements left and increment the pointers