Implement 32fc 32f rotator dot product, puppet, with RVV
Implement a 32-bit floating-point complex rotator and dot product kernel, plus its puppet, taking real (scalar 32-bit floating-point) input vectors, using RVV C intrinsics. Signed-off-by: Marcus Alagar <mvala079@gmail.com>
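
For reference, the operation this kernel vectorizes can be sketched in plain C roughly as follows. This is an illustrative sketch of the semantics, not code from this commit; it assumes lv_32fc_t supports C99 complex arithmetic, as in the generic volk_gnsssdr kernels:

    static inline void rotator_dot_prod_sketch(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
    {
        for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                result[n_vec] = lv_cmake(0.0f, 0.0f);
            }
        for (unsigned int i = 0; i < num_points; i++)
            {
                lv_32fc_t rotated = in_common[i] * (*phase);  // rotate the common signal
                for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
                    {
                        result[n_vec] += in_a[n_vec][i] * rotated;  // real-weighted accumulation
                    }
                (*phase) *= phase_inc;  // advance the rotator one step
            }
    }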
@@ -483,4 +483,229 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
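
/*
 * Rotates the common complex input by a phasor that advances by phase_inc per
 * sample, accumulates the dot product of the rotated signal with each of the
 * num_a_vectors real-valued vectors in in_a, and writes one complex result per
 * vector. phase is updated in place so successive calls continue the rotation.
 */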
static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_rvv(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
{
    const size_t ROTATOR_RELOAD = 256;

    // Reference pointer of compatible type that will not be stripmined
    float* phasePtr = (float*)phase;

    // Pointers of compatible type to track progress while stripmining
    float* outPtr = (float*)result;
    const float* comPtr = (const float*)in_common;
    const float** inPtrBuf = (const float**)volk_gnsssdr_malloc(
        num_a_vectors * sizeof(*in_a), volk_gnsssdr_get_alignment());

    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            // Initialize the accumulator to zero
            result[n_vec] = lv_cmake(0.0f, 0.0f);

            // Copy the input pointer so it can be advanced as elements are consumed
            inPtrBuf[n_vec] = in_a[n_vec];
        }

    for (int _ = 0; _ < num_points / ROTATOR_RELOAD; _++)
        {
            size_t n = ROTATOR_RELOAD;

            for (size_t vl; n > 0; n -= vl, comPtr += vl * 2)
                {
                    // Record how many elements will actually be processed
                    vl = __riscv_vsetvl_e32m4(n);

                    // Splat phase
                    vfloat32m4_t phaseRealVal = __riscv_vfmv_v_f_f32m4(phasePtr[0], vl);
                    vfloat32m4_t phaseImagVal = __riscv_vfmv_v_f_f32m4(phasePtr[1], vl);

                    // Splat phase_inc
                    vfloat32m4_t phaseIncRealVal = __riscv_vfmv_v_f_f32m4(lv_creal(phase_inc), vl);
                    vfloat32m4_t phaseIncImagVal = __riscv_vfmv_v_f_f32m4(lv_cimag(phase_inc), vl);

                    // iter[i] = i
                    vuint32m4_t iterVal = __riscv_vid_v_u32m4(vl);

                    // phase[i] = phase[i] * ( phase_inc ^ i )
                    for (size_t j = 1; j < vl; j++)
                        {
                            vbool8_t maskVal = __riscv_vmsgtu_vx_u32m4_b8(iterVal, 0, vl);

                            // Work on copies of phase so the masked operations can
                            // target them instead of the original vectors
                            vfloat32m4_t prodRealVal = phaseRealVal;
                            vfloat32m4_t prodImagVal = phaseImagVal;

                            // For details on this complex-multiplication pattern,
                            // see `volk_gnsssdr_8ic_x2_multiply_8ic_rvv`, for instance
                            prodRealVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodRealVal, phaseRealVal, phaseIncRealVal, vl);
                            prodRealVal = __riscv_vfnmsac_vv_f32m4_mu(maskVal, prodRealVal, phaseImagVal, phaseIncImagVal, vl);
                            prodImagVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodImagVal, phaseRealVal, phaseIncImagVal, vl);
                            prodImagVal = __riscv_vfmacc_vv_f32m4_mu(maskVal, prodImagVal, phaseImagVal, phaseIncRealVal, vl);

                            phaseRealVal = prodRealVal;
                            phaseImagVal = prodImagVal;

                            iterVal = __riscv_vsub_vx_u32m4_mu(maskVal, iterVal, iterVal, 1, vl);
                        }
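
                    // Net effect of the loop above: lane i was multiplied by
                    // phase_inc exactly i times, so the vectors now hold
                    // phase * phase_inc^i for i in [0, vl)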

                    // Load com[0..vl)
                    vfloat32m4x2_t comVal = __riscv_vlseg2e32_v_f32m4x2(comPtr, vl);
                    vfloat32m4_t comRealVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 0);
                    vfloat32m4_t comImagVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 1);

                    // comProd[i] = com[i] * phase[i]
                    vfloat32m4_t comProdRealVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseRealVal, vl);
                    comProdRealVal = __riscv_vfnmsac_vv_f32m4(comProdRealVal, comImagVal, phaseImagVal, vl);
                    vfloat32m4_t comProdImagVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseImagVal, vl);
                    comProdImagVal = __riscv_vfmacc_vv_f32m4(comProdImagVal, comImagVal, phaseRealVal, vl);
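
                    // Componentwise complex multiply:
                    //   (a + bi)(c + di) = (ac - bd) + (ad + bc)i
                    // vfnmsac computes acc - a*b, yielding the (ac - bd) real part;
                    // vfmacc computes acc + a*b, yielding the (ad + bc) imaginary part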

                    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
                        {
                            // Load in[0..vl)
                            vfloat32m4_t inVal = __riscv_vle32_v_f32m4(inPtrBuf[n_vec], vl);

                            // out[i] = in[i] * comProd[i]
                            vfloat32m4_t outRealVal = __riscv_vfmul_vv_f32m4(inVal, comProdRealVal, vl);
                            vfloat32m4_t outImagVal = __riscv_vfmul_vv_f32m4(inVal, comProdImagVal, vl);

                            // Load accumulator
                            vfloat32m1_t accRealVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec], 1);
                            vfloat32m1_t accImagVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec + 1], 1);

                            // acc[0] = sum( acc[0], out[0..vl) )
                            accRealVal = __riscv_vfredosum_vs_f32m4_f32m1(outRealVal, accRealVal, vl);
                            accImagVal = __riscv_vfredosum_vs_f32m4_f32m1(outImagVal, accImagVal, vl);
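
                            // vfredosum is the ordered reduction: lanes are summed
                            // sequentially, matching scalar accumulation order at some
                            // latency cost versus the unordered vfredusum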

                            // Store acc[0]
                            outPtr[2 * n_vec] = __riscv_vfmv_f_s_f32m1_f32(accRealVal);
                            outPtr[2 * n_vec + 1] = __riscv_vfmv_f_s_f32m1_f32(accImagVal);

                            // Advance this input pointer past the processed elements
                            inPtrBuf[n_vec] += vl;
                        }

                    // Store phase[vl - 1]
                    phaseRealVal = __riscv_vslidedown_vx_f32m4(phaseRealVal, vl - 1, vl);
                    phasePtr[0] = __riscv_vfmv_f_s_f32m4_f32(phaseRealVal);
                    phaseImagVal = __riscv_vslidedown_vx_f32m4(phaseImagVal, vl - 1, vl);
                    phasePtr[1] = __riscv_vfmv_f_s_f32m4_f32(phaseImagVal);

                    // The last lane held phase * phase_inc^(vl - 1); advance the scalar
                    // phase one more step so the next chunk starts at phase * phase_inc^vl
                    (*phase) *= phase_inc;

                    // The loop header decrements the number of elements left and
                    // advances the pointers by the number of elements processed
                }
            // Regenerate phase: repeated finite-precision multiplications make its
            // magnitude drift away from 1, so renormalize once per reload period
#ifdef __cplusplus
            (*phase) /= std::abs((*phase));
#else
            (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif
        }

    size_t n = num_points % ROTATOR_RELOAD;
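
    // Tail pass: mirrors the reload loop body above, consuming the remaining
    // num_points % ROTATOR_RELOAD elements (without the periodic renormalization)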

    for (size_t vl; n > 0; n -= vl, comPtr += vl * 2)
        {
            // Record how many elements will actually be processed
            vl = __riscv_vsetvl_e32m4(n);

            // Splat phase
            vfloat32m4_t phaseRealVal = __riscv_vfmv_v_f_f32m4(phasePtr[0], vl);
            vfloat32m4_t phaseImagVal = __riscv_vfmv_v_f_f32m4(phasePtr[1], vl);

            // Splat phase_inc
            vfloat32m4_t phaseIncRealVal = __riscv_vfmv_v_f_f32m4(lv_creal(phase_inc), vl);
            vfloat32m4_t phaseIncImagVal = __riscv_vfmv_v_f_f32m4(lv_cimag(phase_inc), vl);

            // iter[i] = i
            vuint32m4_t iterVal = __riscv_vid_v_u32m4(vl);

            // phase[i] = phase[i] * ( phase_inc ^ i )
            for (size_t j = 1; j < vl; j++)
                {
                    vbool8_t maskVal = __riscv_vmsgtu_vx_u32m4_b8(iterVal, 0, vl);

                    // Work on copies of phase so the masked operations can
                    // target them instead of the original vectors
                    vfloat32m4_t prodRealVal = phaseRealVal;
                    vfloat32m4_t prodImagVal = phaseImagVal;

                    // For details on this complex-multiplication pattern,
                    // see `volk_gnsssdr_8ic_x2_multiply_8ic_rvv`, for instance
                    prodRealVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodRealVal, phaseRealVal, phaseIncRealVal, vl);
                    prodRealVal = __riscv_vfnmsac_vv_f32m4_mu(maskVal, prodRealVal, phaseImagVal, phaseIncImagVal, vl);
                    prodImagVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodImagVal, phaseRealVal, phaseIncImagVal, vl);
                    prodImagVal = __riscv_vfmacc_vv_f32m4_mu(maskVal, prodImagVal, phaseImagVal, phaseIncRealVal, vl);

                    phaseRealVal = prodRealVal;
                    phaseImagVal = prodImagVal;

                    iterVal = __riscv_vsub_vx_u32m4_mu(maskVal, iterVal, iterVal, 1, vl);
                }

            // Load com[0..vl)
            vfloat32m4x2_t comVal = __riscv_vlseg2e32_v_f32m4x2(comPtr, vl);
            vfloat32m4_t comRealVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 0);
            vfloat32m4_t comImagVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 1);

            // comProd[i] = com[i] * phase[i]
            vfloat32m4_t comProdRealVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseRealVal, vl);
            comProdRealVal = __riscv_vfnmsac_vv_f32m4(comProdRealVal, comImagVal, phaseImagVal, vl);
            vfloat32m4_t comProdImagVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseImagVal, vl);
            comProdImagVal = __riscv_vfmacc_vv_f32m4(comProdImagVal, comImagVal, phaseRealVal, vl);

            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
                {
                    // Load in[0..vl)
                    vfloat32m4_t inVal = __riscv_vle32_v_f32m4(inPtrBuf[n_vec], vl);

                    // out[i] = in[i] * comProd[i]
                    vfloat32m4_t outRealVal = __riscv_vfmul_vv_f32m4(inVal, comProdRealVal, vl);
                    vfloat32m4_t outImagVal = __riscv_vfmul_vv_f32m4(inVal, comProdImagVal, vl);

                    // Load accumulator
                    vfloat32m1_t accRealVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec], 1);
                    vfloat32m1_t accImagVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec + 1], 1);

                    // acc[0] = sum( acc[0], out[0..vl) )
                    accRealVal = __riscv_vfredosum_vs_f32m4_f32m1(outRealVal, accRealVal, vl);
                    accImagVal = __riscv_vfredosum_vs_f32m4_f32m1(outImagVal, accImagVal, vl);

                    // Store acc[0]
                    outPtr[2 * n_vec] = __riscv_vfmv_f_s_f32m1_f32(accRealVal);
                    outPtr[2 * n_vec + 1] = __riscv_vfmv_f_s_f32m1_f32(accImagVal);

                    // Advance this input pointer past the processed elements
                    inPtrBuf[n_vec] += vl;
                }

            // Store phase[vl - 1]
            phaseRealVal = __riscv_vslidedown_vx_f32m4(phaseRealVal, vl - 1, vl);
            phasePtr[0] = __riscv_vfmv_f_s_f32m4_f32(phaseRealVal);
            phaseImagVal = __riscv_vslidedown_vx_f32m4(phaseImagVal, vl - 1, vl);
            phasePtr[1] = __riscv_vfmv_f_s_f32m4_f32(phaseImagVal);

            // The last lane held phase * phase_inc^(vl - 1); advance the scalar
            // phase one more step so the next chunk starts at phase * phase_inc^vl
            (*phase) *= phase_inc;

            // The loop header decrements the number of elements left and
            // advances the pointers by the number of elements processed
        }

    // Don't leak memory!
    volk_gnsssdr_free(inPtrBuf);
}

#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */


@@ -150,4 +150,35 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3

#endif // AVX


#ifdef LV_HAVE_RVV
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_rvv(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
    // Phases must be normalized. The phase rotator expects a complex exponential input!
    float rem_carrier_phase_in_rad = 0.25;
    float phase_step_rad = 0.1;
    lv_32fc_t phase[1];
    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
    lv_32fc_t phase_inc[1];
    phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
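    // Both are unit-magnitude complex exponentials, e^{j*0.25} and e^{j*0.1},
    // so no extra normalization is needed before calling the kernel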
    int n;
    int num_a_vectors = 3;
    float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
        }

    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_rvv(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);

    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}

#endif // RVV

#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H
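
For illustration only (not part of this commit): a minimal sketch of how the puppet could be exercised directly. The buffer size, the constant fill values, and the standalone main() harness are assumptions; in the tree the puppet is normally driven by the volk_gnsssdr QA machinery.

    #include <volk_gnsssdr/volk_gnsssdr.h>
    #include <volk_gnsssdr/volk_gnsssdr_malloc.h>

    int main(void)
    {
        const unsigned int num_points = 1000;
        size_t al = volk_gnsssdr_get_alignment();
        lv_32fc_t* code = (lv_32fc_t*)volk_gnsssdr_malloc(num_points * sizeof(lv_32fc_t), al);
        float* in = (float*)volk_gnsssdr_malloc(num_points * sizeof(float), al);
        lv_32fc_t result[3];  // the puppet hard-codes num_a_vectors = 3
        for (unsigned int i = 0; i < num_points; i++)
            {
                code[i] = lv_cmake(1.0f, 0.0f);  // arbitrary test data
                in[i] = 0.1f;
            }
        volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_rvv(result, code, in, num_points);
        volk_gnsssdr_free(in);
        volk_gnsssdr_free(code);
        return 0;
    }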