From f5999872f6abc2a194cc22bba4b2a8d585df2f37 Mon Sep 17 00:00:00 2001 From: Marcus Alagar Date: Sun, 24 Aug 2025 14:07:34 -0500 Subject: [PATCH] Optimize 16ic to 32fc conversion Optimize conversion from 16-bit integer complex numbers to 32-bit floating-point complex numbers using RVV intrinsics by skipping any need to distinguish between real and imaginary numbers and instead just loading a contiguous selection of components. Timing results: Old - generic: 39736.8 ms, rvv: 10406.7 ms New - generic: 40479.9 ms, rvv: 8245.38 ms Signed-off-by: Marcus Alagar --- .../volk_gnsssdr_16ic_convert_32fc.h | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h index a2cd84624..e5335dd08 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h @@ -270,35 +270,30 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector, static inline void volk_gnsssdr_16ic_convert_32fc_rvv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) { - size_t n = num_points; + // Will be converting by number, with each + // complex number containing two component numbers + size_t n = num_points * 2; // Initialize pointers to keep track as stripmine float* outPtr = (float*) outputVector; const short* inPtr = (const short*) inputVector; - for (size_t vl; n > 0; n -= vl, outPtr += vl * 2, inPtr += vl * 2) + for (size_t vl; n > 0; n -= vl, outPtr += vl, inPtr += vl) { // Record how many elements will actually be processed - // Only use a EMUL of 2 so that when widen, EMUL = 4 - // which still allows a segment store with NFIELDS = 2 - vl 
= __riscv_vsetvl_e16m2(n); + // Only use a EMUL of 4 so that when widen, EMUL = 8 + vl = __riscv_vsetvl_e16m4(n); - // Load inReal[0..vl), inImag[0..vl) - vint16m2x2_t inVal = __riscv_vlseg2e16_v_i16m2x2(inPtr, vl); - vint16m2_t inRealVal = __riscv_vget_v_i16m2x2_i16m2(inVal, 0); - vint16m2_t inImagVal = __riscv_vget_v_i16m2x2_i16m2(inVal, 1); + // Don't have to segment store/load since converting + // both real and imaginary components + // Load in[0..vl) + vint16m4_t inVal = __riscv_vle16_v_i16m4(inPtr, vl); - // outReal[i] = (float) inReal[i] - // outImag[i] = (float) inImag[i] - vfloat32m4_t outRealVal = __riscv_vfwcvt_f_x_v_f32m4(inRealVal, vl); - vfloat32m4_t outImagVal = __riscv_vfwcvt_f_x_v_f32m4(inImagVal, vl); + // out[i] = (float) in[i] + vfloat32m8_t outVal = __riscv_vfwcvt_f_x_v_f32m8(inVal, vl); - // Store outReal[0..vl), outImag[0..vl) - vfloat32m4x2_t outVal = __riscv_vset_v_f32m4_f32m4x2( - __riscv_vundefined_f32m4x2(), 0, outRealVal - ); - outVal = __riscv_vset_v_f32m4_f32m4x2(outVal, 1, outImagVal); - __riscv_vsseg2e32_v_f32m4x2(outPtr, outVal, vl); + // Store out[0..vl) + __riscv_vse32_v_f32m8(outPtr, outVal, vl); // In looping, decrement the number // elements left and increment the pointers