From f4875012df9bfa63d1d558376be6cef1f21e952c Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Sun, 31 Jan 2016 10:41:51 +0100 Subject: [PATCH] prefetch data into the cache in the NEON implementation: 8% improvement on average --- .../volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h index 765fac222..21b54779d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h @@ -290,7 +290,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe for(; i < neon_iters; ++i) { /* load 4 complex numbers (int 16 bits each component) */ - tmp16 = vld2_s16((int16_t*)_in); _in += 4; + tmp16 = vld2_s16((int16_t*)_in); + __builtin_prefetch(_in + 8); + _in += 4; /* promote them to int 32 bits */ tmp32i.val[0] = vmovl_s16(tmp16.val[0]);