1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-12-03 23:28:07 +00:00

Implement 16ic xn dot product, puppet, with RVV

Implement computing the dot products of multiple vectors
dotted with a common vector, all with elements of 16-bit complex
numbers (with both the real and imaginary part being 16 bits)
using RVV C intrinsics in `volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_rvv`.

Signed-off-by: Marcus Alagar <mvala079@gmail.com>
This commit is contained in:
Marcus Alagar
2025-08-09 22:56:17 -05:00
parent 566d286995
commit bc6c5d968d
2 changed files with 99 additions and 0 deletions

View File

@@ -722,4 +722,80 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_rvv(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) {
int n_vec = num_a_vectors;
for (int i = 0; i < n_vec; i++)
{
size_t n = num_points;
short* resPtr = (short*) &result[i];
// Initialize pointers to track progress as stripmine
const short* comPtr = (const short*) in_common;
const short* aPtr = (const short*) in_a[i];
// Use 32-bit accumulator in order to saturate
// to 16 bits
// accReal[0] = 0
vint32m1_t accRealVal = __riscv_vmv_s_x_i32m1(0, 1);
// accImag[0] = 0
vint32m1_t accImagVal = __riscv_vmv_s_x_i32m1(0, 1);
for (size_t vl; n > 0; n -= vl, comPtr += vl * 2, aPtr += vl * 2)
{
// Record how many elements will actually be processed
vl = __riscv_vsetvl_e16m4(n);
// Load comReal[0..vl), comImag[0..vl)
vint16m4x2_t comVal = __riscv_vlseg2e16_v_i16m4x2(comPtr, vl);
vint16m4_t comRealVal = __riscv_vget_v_i16m4x2_i16m4(comVal, 0);
vint16m4_t comImagVal = __riscv_vget_v_i16m4x2_i16m4(comVal, 1);
// Load aReal[0..vl), aImag[0..vl)
vint16m4x2_t aVal = __riscv_vlseg2e16_v_i16m4x2(aPtr, vl);
vint16m4_t aRealVal = __riscv_vget_v_i16m4x2_i16m4(aVal, 0);
vint16m4_t aImagVal = __riscv_vget_v_i16m4x2_i16m4(aVal, 1);
// outReal[i] = -(comImag[i] * aImag[i]) + comReal[i] * aReal[i]
vint16m4_t outRealVal = __riscv_vmul_vv_i16m4(comRealVal, aRealVal, vl);
outRealVal = __riscv_vnmsac_vv_i16m4(outRealVal, comImagVal, aImagVal, vl);
// accReal[0] = sum( accReal[0], outReal[0..vl) )
accRealVal = __riscv_vwredsum_vs_i16m4_i32m1(outRealVal, accRealVal, vl);
// Saturate accReal[0] within 16 bits
accRealVal = __riscv_vmin_vx_i32m1(accRealVal, 32767, 1);
accRealVal = __riscv_vmax_vx_i32m1(accRealVal, -32768, 1);
// outImag[i] = (comImag[i] * aReal[i]) + comReal[i] * aImag[i]
vint16m4_t outImagVal = __riscv_vmul_vv_i16m4(comRealVal, aImagVal, vl);
outImagVal = __riscv_vmacc_vv_i16m4(outImagVal, comImagVal, aRealVal, vl);
// accImag[0] = sum( accImag[0], outImag[0..vl) )
accImagVal = __riscv_vwredsum_vs_i16m4_i32m1(outImagVal, accImagVal, vl);
// Saturate accImag[0] within 16 bits
accImagVal = __riscv_vmin_vx_i32m1(accImagVal, 32767, 1);
accImagVal = __riscv_vmax_vx_i32m1(accImagVal, -32768, 1);
// In looping, decrement the number of
// elements left and increment the pointers
// by the number of elements processed,
// taking into account how the `vl` complex
// numbers are each stored as 2 `short`s
}
// Real part of resultant complex number
resPtr[0] = (short) __riscv_vmv_x_s_i32m1_i32(accRealVal);
// Imaginary part of resultant complex number
resPtr[1] = (short) __riscv_vmv_x_s_i32m1_i32(accImagVal);
}
}
#endif /* LV_HAVE_RVV */
#endif /* INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H */

View File

@@ -247,4 +247,27 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
#endif // NEON
#ifdef LV_HAVE_RVV
static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_rvv(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_rvv(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif /* LV_HAVE_RVV */
#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H