mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-31 15:23:04 +00:00 
			
		
		
		
	add neon implementation
This commit is contained in:
		| @@ -255,4 +255,94 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out, | |||||||
| } | } | ||||||
| #endif /* LV_HAVE_SSE2 */ | #endif /* LV_HAVE_SSE2 */ | ||||||
|  |  | ||||||
|  | #ifdef LV_HAVE_NEON | ||||||
|  | #include <arm_neon.h> | ||||||
|  |  | ||||||
|  | static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a,  int num_a_vectors, unsigned int num_points) | ||||||
|  | { | ||||||
|  |     lv_16sc_t dotProduct = lv_cmake(0,0); | ||||||
|  |  | ||||||
|  |     const unsigned int neon_iters = num_points / 4; | ||||||
|  |  | ||||||
|  |     const lv_16sc_t** _in_a = in_a; | ||||||
|  |     const lv_16sc_t* _in_common = in_common; | ||||||
|  |     lv_16sc_t* _out = out; | ||||||
|  |  | ||||||
|  |     if (neon_iters > 0) | ||||||
|  |         { | ||||||
|  |             __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||||||
|  |  | ||||||
|  |             int16x4x2_t a_val, b_val, c_val; | ||||||
|  |  | ||||||
|  |             //todo dyn mem reg | ||||||
|  |             int16x4x2_t* accumulator; | ||||||
|  |             accumulator = (int16x4x2_t*)calloc(num_a_vectors, sizeof(int16x4x2_t)); | ||||||
|  |  | ||||||
|  |             int16x4x2_t tmp_real, tmp_imag; | ||||||
|  |             lv_16sc_t accum_result[4]; | ||||||
|  |  | ||||||
|  |             for(int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|  |                 { | ||||||
|  |                     accumulator[n_vec].val[0] = vdup_n_s16(0); | ||||||
|  |                     accumulator[n_vec].val[1] = vdup_n_s16(0); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |             for(unsigned int number = 0; number < neon_iters; number++) | ||||||
|  |                 { | ||||||
|  |                     b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |                     __builtin_prefetch(_in_common + 8); | ||||||
|  |                     for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|  |                         { | ||||||
|  |                             a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||||
|  |                             //__builtin_prefetch(_in_a[n_vec] + 8); | ||||||
|  |  | ||||||
|  |                             // multiply the real*real and imag*imag to get real result | ||||||
|  |                             // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r | ||||||
|  |                             tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); | ||||||
|  |                             // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i | ||||||
|  |                             tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); | ||||||
|  |  | ||||||
|  |                             // Multiply cross terms to get the imaginary result | ||||||
|  |                             // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i | ||||||
|  |                             tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); | ||||||
|  |                             // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r | ||||||
|  |                             tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); | ||||||
|  |  | ||||||
|  |                             c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); | ||||||
|  |                             c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); | ||||||
|  |  | ||||||
|  |                             accumulator[n_vec].val[0] = vadd_s16(accumulator[n_vec].val[0], c_val.val[0]); | ||||||
|  |                             accumulator[n_vec].val[1] = vadd_s16(accumulator[n_vec].val[1], c_val.val[1]); | ||||||
|  |                         } | ||||||
|  |                     _in_common += 4; | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |             for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|  |                 { | ||||||
|  |                     vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector | ||||||
|  |                     dotProduct = lv_cmake(0,0); | ||||||
|  |                     for (int i = 0; i < 4; ++i) | ||||||
|  |                         { | ||||||
|  |                             dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), | ||||||
|  |                                     sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); | ||||||
|  |                         } | ||||||
|  |                     _out[n_vec] = dotProduct; | ||||||
|  |                 } | ||||||
|  |             free(accumulator); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |     for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||||
|  |         { | ||||||
|  |             for(unsigned int n  = neon_iters * 4; n < num_points; n++) | ||||||
|  |                 { | ||||||
|  |                     lv_16sc_t tmp = in_common[n] * in_a[n_vec][n]; | ||||||
|  |  | ||||||
|  |                     _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), | ||||||
|  |                             sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  | } | ||||||
|  | #endif /* LV_HAVE_NEON */ | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/ | #endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/ | ||||||
|   | |||||||
| @@ -105,6 +105,28 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r | |||||||
|  |  | ||||||
| #endif // SSE2 | #endif // SSE2 | ||||||
|  |  | ||||||
|  | #ifdef LV_HAVE_NEON | ||||||
|  |  | ||||||
|  | static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) | ||||||
|  | { | ||||||
|  |     int num_a_vectors = 3; | ||||||
|  |     lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); | ||||||
|  |     for(unsigned int n = 0; n < num_a_vectors; n++) | ||||||
|  |     { | ||||||
|  |        in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); | ||||||
|  |        memcpy(in_a[n], in, sizeof(lv_16sc_t)*num_points); | ||||||
|  |     } | ||||||
|  |     volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); | ||||||
|  |  | ||||||
|  |     for(unsigned int n = 0; n < num_a_vectors; n++) | ||||||
|  |     { | ||||||
|  |         volk_gnsssdr_free(in_a[n]); | ||||||
|  |     } | ||||||
|  |     volk_gnsssdr_free(in_a); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif // NEON | ||||||
|  |  | ||||||
| #endif  // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H | #endif  // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Carles Fernandez
					Carles Fernandez