From 4a2e03f9a8ca1b6e5b14459f8609058d1e2f851f Mon Sep 17 00:00:00 2001 From: andres Date: Mon, 20 Oct 2014 16:55:55 +0200 Subject: [PATCH] Use _mm_loadu_si128 instead of _mm_load_si128 in unaligned (_u_sse2) kernels --- .../volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h | 4 ++-- .../volk_gnsssdr_8ic_x2_dot_prod_8ic.h | 4 ++-- .../volk_gnsssdr_8ic_x2_multiply_8ic.h | 4 ++-- .../volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h | 10 +++++----- .../volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h | 10 +++++----- .../volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h | 14 +++++++------- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h index 44fc0137e..8e251ebf8 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h @@ -59,8 +59,8 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a for(int number = 0; number < sse_iters; number++){ - aVal = _mm_load_si128((__m128i*)aPtr); - bVal = _mm_load_si128((__m128i*)bPtr); + aVal = _mm_loadu_si128((__m128i*)aPtr); + bVal = _mm_loadu_si128((__m128i*)bPtr); cVal = _mm_add_epi8(aVal, bVal); diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h index 4f3cab796..a77dff056 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h @@ -119,8 +119,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con for(int number = 0; number < sse_iters; number++){ - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); + x = _mm_loadu_si128((__m128i*)a); + y = 
_mm_loadu_si128((__m128i*)b); imagx = _mm_srli_si128 (x, 1); imagx = _mm_and_si128 (imagx, mult1); diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h index 2828fa73c..17c15c758 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h @@ -62,8 +62,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co for(int number = 0;number < sse_iters; number++){ - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); + x = _mm_loadu_si128((__m128i*)a); + y = _mm_loadu_si128((__m128i*)b); imagx = _mm_srli_si128 (x, 1); imagx = _mm_and_si128 (imagx, mult1); diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h index 81a5b061a..f98447d6f 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h @@ -222,8 +222,8 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o for(int number = 0;number < sse_iters; number++){ //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) @@ -231,7 +231,7 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, 
imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); + y = _mm_loadu_si128((__m128i*)E_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) @@ -239,7 +239,7 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); + y = _mm_loadu_si128((__m128i*)P_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) @@ -247,7 +247,7 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); + y = _mm_loadu_si128((__m128i*)L_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h index 83d5abb88..f89ef8c5c 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h @@ -268,8 +268,8 @@ 
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out for(int number = 0;number < sse_iters; number++){ //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); imagx = _mm_srli_si128 (x, 1); imagx = _mm_and_si128 (imagx, mult1); @@ -288,7 +288,7 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); + y = _mm_loadu_si128((__m128i*)E_code_ptr); imagy = _mm_srli_si128 (y, 1); imagy = _mm_and_si128 (imagy, mult1); @@ -306,7 +306,7 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); + y = _mm_loadu_si128((__m128i*)L_code_ptr); imagy = _mm_srli_si128 (y, 1); imagy = _mm_and_si128 (imagy, mult1); @@ -324,7 +324,7 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); + y = _mm_loadu_si128((__m128i*)P_code_ptr); imagy = _mm_srli_si128 (y, 1); imagy = _mm_and_si128 (imagy, mult1); diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h index 5c4b79743..5a34242fb 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h @@ -289,8 +289,8 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE 
for(int number = 0;number < sse_iters; number++){ //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) @@ -298,7 +298,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) //Get very early values - y = _mm_load_si128((__m128i*)VE_code_ptr); + y = _mm_loadu_si128((__m128i*)VE_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) @@ -306,7 +306,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); + y = _mm_loadu_si128((__m128i*)E_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) @@ -314,7 +314,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); + y = _mm_loadu_si128((__m128i*)P_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, 
realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) @@ -322,7 +322,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); + y = _mm_loadu_si128((__m128i*)L_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) @@ -330,7 +330,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); //Get very late values - y = _mm_load_si128((__m128i*)VL_code_ptr); + y = _mm_loadu_si128((__m128i*)VL_code_ptr); CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)