1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-06-18 11:09:56 +00:00

Remove _mm256_zeroupper() calls

Calling _mm256_zeroupper() manually is not required, and it leads to miscompilation with GCC 10.2 at optimization level -O3.
This commit is contained in:
Carles Fernandez 2021-01-04 14:07:56 +01:00
parent cbdb2ad2e9
commit 68fdedb224
No known key found for this signature in database
GPG Key ID: 4C583C52B0C3877D
14 changed files with 18 additions and 32 deletions

View File

@ -416,7 +416,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
@ -494,7 +494,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)

View File

@ -611,7 +611,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
}
volk_gnsssdr_free(cacc);
_mm256_zeroupper();
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
(*phase) = four_phase_acc[0];
@ -763,7 +762,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
}
volk_gnsssdr_free(cacc);
_mm256_zeroupper();
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
(*phase) = four_phase_acc[0];

View File

@ -129,7 +129,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
_in += 4;
_out += 4;
}
_mm256_zeroupper();
for (i = 0; i < (num_points % 4); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
@ -156,7 +156,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
_in += 4;
_out += 4;
}
_mm256_zeroupper();
for (i = 0; i < (num_points % 4); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));

View File

@ -282,7 +282,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper();
for (i = 0; i < 8; ++i)
{
@ -360,7 +359,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper();
for (i = 0; i < 8; ++i)
{

View File

@ -364,7 +364,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
}
_mm256_zeroupper();
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
@ -459,7 +458,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
}
_mm256_zeroupper();
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{

View File

@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
_in_b += 8;
_out += 8;
}
_mm256_zeroupper();
number = avx2_points * 8;
for (; number < num_points; number++)
{
@ -268,7 +268,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
_in_b += 8;
_out += 8;
}
_mm256_zeroupper();
number = avx2_points * 8;
for (; number < num_points; number++)
{

View File

@ -945,7 +945,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
_mm256_zeroupper();
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
@ -1261,7 +1261,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
}
_out[n_vec] = dotProduct;
}
_mm256_zeroupper();
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);

View File

@ -415,7 +415,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
@ -493,7 +493,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)

View File

@ -491,8 +491,6 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_a_avx(floa
indexn = _mm256_add_ps(indexn, eights);
}
_mm256_zeroupper();
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for first tap
@ -579,8 +577,6 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
indexn = _mm256_add_ps(indexn, eights);
}
_mm256_zeroupper();
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for first tap

View File

@ -417,7 +417,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
@ -495,7 +495,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)

View File

@ -294,7 +294,6 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
z0 = _mm256_complexnormalise_ps(z0);
_mm256_store_ps((float*)phase_vec, z0);
_phase = phase_vec[0];
_mm256_zeroupper();
number = sixteenthPoints * 16;
for (; number < num_points; number++)
@ -462,7 +461,6 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
z0 = _mm256_complexnormalise_ps(z0);
_mm256_store_ps((float*)phase_vec, z0);
_phase = phase_vec[0];
_mm256_zeroupper();
number = sixteenthPoints * 16;
for (; number < num_points; number++)

View File

@ -488,7 +488,6 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
_mm256_zeroupper();
for (n = avx_iters * 4; n < num_points; n++)
{
@ -618,7 +617,6 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
_mm256_zeroupper();
for (n = avx_iters * 4; n < num_points; n++)
{

View File

@ -414,7 +414,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
@ -492,7 +492,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
@ -571,7 +571,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
@ -650,7 +650,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)

View File

@ -616,7 +616,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const fl
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
_mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for (number = avx_iters * 8; number < num_points; number++)
{
@ -803,7 +803,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
_mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for (number = avx_iters * 8; number < num_points; number++)
{