diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
index 56bced832..06e657064 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
@@ -483,4 +483,229 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
 #endif /* LV_HAVE_AVX */
 
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_rvv(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
+{
+    size_t ROTATOR_RELOAD = 256;
+
+    // Reference pointer of compatible type that will not be stripmined
+    float* phasePtr = (float*)phase;
+
+    // Pointers of compatible type to track progress as we stripmine;
+    // each complex number is treated as a pair of 32-bit floats
+    float* outPtr = (float*)result;
+    const float* comPtr = (const float*)in_common;
+    const float** inPtrBuf = (const float**)volk_gnsssdr_malloc(
+        num_a_vectors * sizeof(*in_a), volk_gnsssdr_get_alignment());
+
+    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+        {
+            // Initialize `out` to zero
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
+
+            inPtrBuf[n_vec] = in_a[n_vec];
+        }
+
+    for (int _ = 0; _ < num_points / ROTATOR_RELOAD; _++)
+        {
+            size_t n = ROTATOR_RELOAD;
+
+            for (size_t vl; n > 0; n -= vl, comPtr += vl * 2)
+                {
+                    // Record how many elements will actually be processed
+                    vl = __riscv_vsetvl_e32m4(n);
+
+                    // Splat phase
+                    vfloat32m4_t phaseRealVal = __riscv_vfmv_v_f_f32m4(phasePtr[0], vl);
+                    vfloat32m4_t phaseImagVal = __riscv_vfmv_v_f_f32m4(phasePtr[1], vl);
+
+                    // Splat phase_inc
+                    vfloat32m4_t phaseIncRealVal = __riscv_vfmv_v_f_f32m4(lv_creal(phase_inc), vl);
+                    vfloat32m4_t phaseIncImagVal = __riscv_vfmv_v_f_f32m4(lv_cimag(phase_inc), vl);
+
+                    // iter[i] = i
+                    vuint32m4_t iterVal = __riscv_vid_v_u32m4(vl);
+
+                    // phase[i] = phase[i] * ( phase_inc ^ i )
+                    for (int j = 1; j < vl; j++)
+                        {
+                            vbool8_t maskVal = __riscv_vmsgtu_vx_u32m4_b8(iterVal, 0, vl);
+
+                            // Work on copies of phase so that the masked
+                            // operations target these copies instead of the
+                            // original vectors
+                            vfloat32m4_t prodRealVal = phaseRealVal;
+                            vfloat32m4_t prodImagVal = phaseImagVal;
+
+                            // For more details on the complex product, see
+                            // `volk_gnsssdr_8ic_x2_multiply_8ic_rvv`, for instance
+                            prodRealVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodRealVal, phaseRealVal, phaseIncRealVal, vl);
+                            prodRealVal = __riscv_vfnmsac_vv_f32m4_mu(maskVal, prodRealVal, phaseImagVal, phaseIncImagVal, vl);
+                            prodImagVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodImagVal, phaseRealVal, phaseIncImagVal, vl);
+                            prodImagVal = __riscv_vfmacc_vv_f32m4_mu(maskVal, prodImagVal, phaseImagVal, phaseIncRealVal, vl);
+
+                            phaseRealVal = prodRealVal;
+                            phaseImagVal = prodImagVal;
+
+                            iterVal = __riscv_vsub_vx_u32m4_mu(maskVal, iterVal, iterVal, 1, vl);
+                        }
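+
+                    // At this point lane i holds phase * phase_inc^i: every
+                    // pass of the loop above multiplied each lane with a
+                    // non-zero counter by phase_inc and decremented that
+                    // counter, so lane i was multiplied exactly i times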
+
+                    // Load com[0..vl)
+                    vfloat32m4x2_t comVal = __riscv_vlseg2e32_v_f32m4x2(comPtr, vl);
+                    vfloat32m4_t comRealVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 0);
+                    vfloat32m4_t comImagVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 1);
+
+                    // comProd[i] = com[i] * phase[i]
+                    vfloat32m4_t comProdRealVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseRealVal, vl);
+                    comProdRealVal = __riscv_vfnmsac_vv_f32m4(comProdRealVal, comImagVal, phaseImagVal, vl);
+                    vfloat32m4_t comProdImagVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseImagVal, vl);
+                    comProdImagVal = __riscv_vfmacc_vv_f32m4(comProdImagVal, comImagVal, phaseRealVal, vl);
+
+                    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                        {
+                            // Load in[0..vl)
+                            vfloat32m4_t inVal = __riscv_vle32_v_f32m4(inPtrBuf[n_vec], vl);
+
+                            // out[i] = in[i] * comProd[i]
+                            vfloat32m4_t outRealVal = __riscv_vfmul_vv_f32m4(inVal, comProdRealVal, vl);
+                            vfloat32m4_t outImagVal = __riscv_vfmul_vv_f32m4(inVal, comProdImagVal, vl);
+
+                            // Load the accumulator into element 0
+                            vfloat32m1_t accRealVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec], 1);
+                            vfloat32m1_t accImagVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec + 1], 1);
+
+                            // acc[0] = sum( acc[0], out[0..vl) ), as an ordered reduction
+                            accRealVal = __riscv_vfredosum_vs_f32m4_f32m1(outRealVal, accRealVal, vl);
+                            accImagVal = __riscv_vfredosum_vs_f32m4_f32m1(outImagVal, accImagVal, vl);
+
+                            // Store acc[0]
+                            outPtr[2 * n_vec] = __riscv_vfmv_f_s_f32m1_f32(accRealVal);
+                            outPtr[2 * n_vec + 1] = __riscv_vfmv_f_s_f32m1_f32(accImagVal);
+
+                            // Advance this input pointer
+                            inPtrBuf[n_vec] += vl;
+                        }
+
+                    // Store phase[vl - 1]
+                    phaseRealVal = __riscv_vslidedown_vx_f32m4(phaseRealVal, vl - 1, vl);
+                    phasePtr[0] = __riscv_vfmv_f_s_f32m4_f32(phaseRealVal);
+                    phaseImagVal = __riscv_vslidedown_vx_f32m4(phaseImagVal, vl - 1, vl);
+                    phasePtr[1] = __riscv_vfmv_f_s_f32m4_f32(phaseImagVal);
+
+                    // Account for the multiplication after the last element
+                    (*phase) *= phase_inc;
+
+                    // On looping, decrement the number of elements left and
+                    // advance the pointers by the number of elements processed
+                }
+            // Regenerate phase
+#ifdef __cplusplus
+            (*phase) /= std::abs((*phase));
+#else
+            (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+#endif
+        }
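+
+    // Handle the num_points % ROTATOR_RELOAD samples left over, using the
+    // same stripmined body as above but without the periodic renormalization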
+    size_t n = num_points % ROTATOR_RELOAD;
+
+    for (size_t vl; n > 0; n -= vl, comPtr += vl * 2)
+        {
+            // Record how many elements will actually be processed
+            vl = __riscv_vsetvl_e32m4(n);
+
+            // Splat phase
+            vfloat32m4_t phaseRealVal = __riscv_vfmv_v_f_f32m4(phasePtr[0], vl);
+            vfloat32m4_t phaseImagVal = __riscv_vfmv_v_f_f32m4(phasePtr[1], vl);
+
+            // Splat phase_inc
+            vfloat32m4_t phaseIncRealVal = __riscv_vfmv_v_f_f32m4(lv_creal(phase_inc), vl);
+            vfloat32m4_t phaseIncImagVal = __riscv_vfmv_v_f_f32m4(lv_cimag(phase_inc), vl);
+
+            // iter[i] = i
+            vuint32m4_t iterVal = __riscv_vid_v_u32m4(vl);
+
+            // phase[i] = phase[i] * ( phase_inc ^ i )
+            for (int j = 1; j < vl; j++)
+                {
+                    vbool8_t maskVal = __riscv_vmsgtu_vx_u32m4_b8(iterVal, 0, vl);
+
+                    // Work on copies of phase so that the masked operations
+                    // target these copies instead of the original vectors
+                    vfloat32m4_t prodRealVal = phaseRealVal;
+                    vfloat32m4_t prodImagVal = phaseImagVal;
+
+                    // For more details on the complex product, see
+                    // `volk_gnsssdr_8ic_x2_multiply_8ic_rvv`, for instance
+                    prodRealVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodRealVal, phaseRealVal, phaseIncRealVal, vl);
+                    prodRealVal = __riscv_vfnmsac_vv_f32m4_mu(maskVal, prodRealVal, phaseImagVal, phaseIncImagVal, vl);
+                    prodImagVal = __riscv_vfmul_vv_f32m4_mu(maskVal, prodImagVal, phaseRealVal, phaseIncImagVal, vl);
+                    prodImagVal = __riscv_vfmacc_vv_f32m4_mu(maskVal, prodImagVal, phaseImagVal, phaseIncRealVal, vl);
+
+                    phaseRealVal = prodRealVal;
+                    phaseImagVal = prodImagVal;
+
+                    iterVal = __riscv_vsub_vx_u32m4_mu(maskVal, iterVal, iterVal, 1, vl);
+                }
+
+            // Load com[0..vl)
+            vfloat32m4x2_t comVal = __riscv_vlseg2e32_v_f32m4x2(comPtr, vl);
+            vfloat32m4_t comRealVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 0);
+            vfloat32m4_t comImagVal = __riscv_vget_v_f32m4x2_f32m4(comVal, 1);
+
+            // comProd[i] = com[i] * phase[i]
+            vfloat32m4_t comProdRealVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseRealVal, vl);
+            comProdRealVal = __riscv_vfnmsac_vv_f32m4(comProdRealVal, comImagVal, phaseImagVal, vl);
+            vfloat32m4_t comProdImagVal = __riscv_vfmul_vv_f32m4(comRealVal, phaseImagVal, vl);
+            comProdImagVal = __riscv_vfmacc_vv_f32m4(comProdImagVal, comImagVal, phaseRealVal, vl);
+
+            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                {
+                    // Load in[0..vl)
+                    vfloat32m4_t inVal = __riscv_vle32_v_f32m4(inPtrBuf[n_vec], vl);
+
+                    // out[i] = in[i] * comProd[i]
+                    vfloat32m4_t outRealVal = __riscv_vfmul_vv_f32m4(inVal, comProdRealVal, vl);
+                    vfloat32m4_t outImagVal = __riscv_vfmul_vv_f32m4(inVal, comProdImagVal, vl);
+
+                    // Load the accumulator into element 0
+                    vfloat32m1_t accRealVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec], 1);
+                    vfloat32m1_t accImagVal = __riscv_vfmv_s_f_f32m1(outPtr[2 * n_vec + 1], 1);
+
+                    // acc[0] = sum( acc[0], out[0..vl) ), as an ordered reduction
+                    accRealVal = __riscv_vfredosum_vs_f32m4_f32m1(outRealVal, accRealVal, vl);
+                    accImagVal = __riscv_vfredosum_vs_f32m4_f32m1(outImagVal, accImagVal, vl);
+
+                    // Store acc[0]
+                    outPtr[2 * n_vec] = __riscv_vfmv_f_s_f32m1_f32(accRealVal);
+                    outPtr[2 * n_vec + 1] = __riscv_vfmv_f_s_f32m1_f32(accImagVal);
+
+                    // Advance this input pointer
+                    inPtrBuf[n_vec] += vl;
+                }
+
+            // Store phase[vl - 1]
+            phaseRealVal = __riscv_vslidedown_vx_f32m4(phaseRealVal, vl - 1, vl);
+            phasePtr[0] = __riscv_vfmv_f_s_f32m4_f32(phaseRealVal);
+            phaseImagVal = __riscv_vslidedown_vx_f32m4(phaseImagVal, vl - 1, vl);
+            phasePtr[1] = __riscv_vfmv_f_s_f32m4_f32(phaseImagVal);
+
+            // Account for the multiplication after the last element
+            (*phase) *= phase_inc;
+
+            // On looping, decrement the number of elements left and advance
+            // the pointers by the number of elements processed
+        }
+
+    // Don't leak memory!
+    volk_gnsssdr_free(inPtrBuf);
+}
+
+#endif /* LV_HAVE_RVV */
+
 #endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */
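
For reference, the computation implemented by the RVV kernel above is equivalent to the following scalar model (an illustrative sketch, not part of the patch; the helper name `rotator_dot_prod_32fc_xn_ref` is hypothetical, and `ROTATOR_RELOAD` is fixed at 256 as in the kernel):

    #include <math.h>
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>

    /* result[k] = sum_i in_a[k][i] * (in_common[i] * phase_i), where
     * phase_{i+1} = phase_i * phase_inc and the phase is renormalized
     * every 256 samples to counter the magnitude drift of the repeated
     * complex products. The RVV kernel reduces with vfredosum (ordered
     * sum), so it accumulates in the same order as this loop. */
    static void rotator_dot_prod_32fc_xn_ref(lv_32fc_t* result,
        const lv_32fc_t* in_common, lv_32fc_t phase_inc, lv_32fc_t* phase,
        const float** in_a, int num_a_vectors, unsigned int num_points)
    {
        for (int k = 0; k < num_a_vectors; k++)
            {
                result[k] = lv_cmake(0.0f, 0.0f);
            }
        for (unsigned int i = 0; i < num_points; i++)
            {
                const lv_32fc_t rotated = in_common[i] * (*phase);
                for (int k = 0; k < num_a_vectors; k++)
                    {
                        result[k] += in_a[k][i] * rotated;
                    }
                (*phase) *= phase_inc;
                if ((i + 1) % 256 == 0)
                    {
                        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
                    }
            }
    }
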
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
index d9b29b0d0..92c0bc372 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
@@ -150,4 +150,35 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
 #endif  // AVX
 
+
+#ifdef LV_HAVE_RVV
+static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_rvv(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
+{
+    // Phases must be normalized: the phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.25;
+    float phase_step_rad = 0.1;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
+    int n;
+    int num_a_vectors = 3;
+    float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
+    for (n = 0; n < num_a_vectors; n++)
+        {
+            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
+            memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
+        }
+
+    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_rvv(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
+
+    for (n = 0; n < num_a_vectors; n++)
+        {
+            volk_gnsssdr_free(in_a[n]);
+        }
+    volk_gnsssdr_free(in_a);
+}
+
+#endif  // RVV
+
 #endif  // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H
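
A quick way to sanity-check the new kernel is to compare the puppets directly (an illustrative sketch, not part of the patch; it assumes a build with `LV_HAVE_RVV` defined, RVV hardware, and the `_generic` puppet that this file already provides for the other protokernels):

    #include <math.h>
    #include <stdio.h>
    #include "volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h"

    int main(void)
    {
        const unsigned int num_points = 1000;
        lv_32fc_t* code = (lv_32fc_t*)volk_gnsssdr_malloc(num_points * sizeof(lv_32fc_t), volk_gnsssdr_get_alignment());
        float* in = (float*)volk_gnsssdr_malloc(num_points * sizeof(float), volk_gnsssdr_get_alignment());
        for (unsigned int i = 0; i < num_points; i++)
            {
                // Synthetic local code and real-valued taps
                code[i] = lv_cmake(cosf(0.01f * i), sinf(0.01f * i));
                in[i] = (float)(i % 7) - 3.0f;
            }

        lv_32fc_t ref[3];  // the puppets hard-code num_a_vectors = 3
        lv_32fc_t out[3];
        volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(ref, code, in, num_points);
        volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_rvv(out, code, in, num_points);

        for (int k = 0; k < 3; k++)
            {
                printf("tap %d: generic = (%g, %g), rvv = (%g, %g)\n", k,
                    lv_creal(ref[k]), lv_cimag(ref[k]),
                    lv_creal(out[k]), lv_cimag(out[k]));
            }

        volk_gnsssdr_free(code);
        volk_gnsssdr_free(in);
        return 0;
    }

Results should agree to within floating-point tolerance; exact bit equality is not expected, since the vector kernel computes each lane's phase as phase * phase_inc^i rather than by the scalar recurrence.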