1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-11-04 17:23:20 +00:00

Optimize 16ic to 32fc conversion

Optimize conversion from 16-bit integer complex numbers to
32-bit floating-point complex numbers using RVV intrinsics
by skipping any need to distinguish between real and imaginary
numbers and instead just loading a contiguous selection
of components.

Timing results:
Old - generic: 39736.8 ms, rvv: 10406.7 ms
New - generic: 40479.9 ms, rvv: 8245.38 ms

Signed-off-by: Marcus Alagar <mvala079@gmail.com>
This commit is contained in:
Marcus Alagar
2025-08-24 14:07:34 -05:00
parent 1b265d55be
commit f5999872f6

View File

@@ -270,35 +270,30 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
static inline void volk_gnsssdr_16ic_convert_32fc_rvv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
size_t n = num_points;
// Will be converting by number, with each
// complex number containing two component numbers
size_t n = num_points * 2;
// Initialize pointers to keep track as stripmine
float* outPtr = (float*) outputVector;
const short* inPtr = (const short*) inputVector;
for (size_t vl; n > 0; n -= vl, outPtr += vl * 2, inPtr += vl * 2)
for (size_t vl; n > 0; n -= vl, outPtr += vl, inPtr += vl)
{
// Record how many elements will actually be processed
// Only use a EMUL of 2 so that when widen, EMUL = 4
// which still allows a segment store with NFIELDS = 2
vl = __riscv_vsetvl_e16m2(n);
// Only use a EMUL of 4 so that when widen, EMUL = 8
vl = __riscv_vsetvl_e16m4(n);
// Load inReal[0..vl), inImag[0..vl)
vint16m2x2_t inVal = __riscv_vlseg2e16_v_i16m2x2(inPtr, vl);
vint16m2_t inRealVal = __riscv_vget_v_i16m2x2_i16m2(inVal, 0);
vint16m2_t inImagVal = __riscv_vget_v_i16m2x2_i16m2(inVal, 1);
// Don't have to segment store/load since converting
// both real and imaginary components
// Load in[0..vl)
vint16m4_t inVal = __riscv_vle16_v_i16m4(inPtr, vl);
// outReal[i] = (float) inReal[i]
// outImag[i] = (float) inImag[i]
vfloat32m4_t outRealVal = __riscv_vfwcvt_f_x_v_f32m4(inRealVal, vl);
vfloat32m4_t outImagVal = __riscv_vfwcvt_f_x_v_f32m4(inImagVal, vl);
// out[i] = (float) in[i]
vfloat32m8_t outVal = __riscv_vfwcvt_f_x_v_f32m8(inVal, vl);
// Store outReal[0..vl), outImag[0..vl)
vfloat32m4x2_t outVal = __riscv_vset_v_f32m4_f32m4x2(
__riscv_vundefined_f32m4x2(), 0, outRealVal
);
outVal = __riscv_vset_v_f32m4_f32m4x2(outVal, 1, outImagVal);
__riscv_vsseg2e32_v_f32m4x2(outPtr, outVal, vl);
// Store out[0..vl)
__riscv_vse32_v_f32m8(outPtr, outVal, vl);
// In looping, decrement the number
// elements left and increment the pointers