mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-01-18 21:23:02 +00:00
Remove _mm256_zeroupper() calls
Manually calling _mm256_zeroupper() is not required, and it leads to miscompilation with GCC 10.2 at optimization level -O3.
This commit is contained in:
parent
cbdb2ad2e9
commit
68fdedb224
@ -416,7 +416,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
@ -494,7 +494,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
|
@ -611,7 +611,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
|
||||
}
|
||||
|
||||
volk_gnsssdr_free(cacc);
|
||||
_mm256_zeroupper();
|
||||
|
||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||
(*phase) = four_phase_acc[0];
|
||||
@ -763,7 +762,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
|
||||
}
|
||||
|
||||
volk_gnsssdr_free(cacc);
|
||||
_mm256_zeroupper();
|
||||
|
||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||
(*phase) = four_phase_acc[0];
|
||||
|
@ -129,7 +129,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
|
||||
_in += 4;
|
||||
_out += 4;
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (i = 0; i < (num_points % 4); ++i)
|
||||
{
|
||||
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
|
||||
@ -156,7 +156,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
|
||||
_in += 4;
|
||||
_out += 4;
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (i = 0; i < (num_points % 4); ++i)
|
||||
{
|
||||
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
|
||||
|
@ -282,7 +282,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
||||
result = _mm256_or_si256(realcacc, imagcacc);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (i = 0; i < 8; ++i)
|
||||
{
|
||||
@ -360,7 +359,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
||||
result = _mm256_or_si256(realcacc, imagcacc);
|
||||
|
||||
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (i = 0; i < 8; ++i)
|
||||
{
|
||||
|
@ -364,7 +364,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||
{
|
||||
@ -459,7 +458,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||
{
|
||||
|
@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
|
||||
_in_b += 8;
|
||||
_out += 8;
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
number = avx2_points * 8;
|
||||
for (; number < num_points; number++)
|
||||
{
|
||||
@ -268,7 +268,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
|
||||
_in_b += 8;
|
||||
_out += 8;
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
number = avx2_points * 8;
|
||||
for (; number < num_points; number++)
|
||||
{
|
||||
|
@ -945,7 +945,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
||||
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
_mm256_zeroupper();
|
||||
|
||||
|
||||
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||
(*phase) = two_phase_acc[0];
|
||||
@ -1261,7 +1261,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
||||
}
|
||||
_out[n_vec] = dotProduct;
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
|
||||
|
@ -415,7 +415,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
@ -493,7 +493,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
|
@ -491,8 +491,6 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_a_avx(floa
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
{
|
||||
// resample code for first tap
|
||||
@ -579,8 +577,6 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
{
|
||||
// resample code for first tap
|
||||
|
@ -417,7 +417,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
@ -495,7 +495,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
|
@ -294,7 +294,6 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
|
||||
z0 = _mm256_complexnormalise_ps(z0);
|
||||
_mm256_store_ps((float*)phase_vec, z0);
|
||||
_phase = phase_vec[0];
|
||||
_mm256_zeroupper();
|
||||
|
||||
number = sixteenthPoints * 16;
|
||||
for (; number < num_points; number++)
|
||||
@ -462,7 +461,6 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
|
||||
z0 = _mm256_complexnormalise_ps(z0);
|
||||
_mm256_store_ps((float*)phase_vec, z0);
|
||||
_phase = phase_vec[0];
|
||||
_mm256_zeroupper();
|
||||
|
||||
number = sixteenthPoints * 16;
|
||||
for (; number < num_points; number++)
|
||||
|
@ -488,7 +488,6 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
||||
|
||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||
_phase = four_phase_acc[0];
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (n = avx_iters * 4; n < num_points; n++)
|
||||
{
|
||||
@ -618,7 +617,6 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
||||
|
||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||
_phase = four_phase_acc[0];
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (n = avx_iters * 4; n < num_points; n++)
|
||||
{
|
||||
|
@ -414,7 +414,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
@ -492,7 +492,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
@ -571,7 +571,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
@ -650,7 +650,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (n = avx_iters * 8; n < num_points; n++)
|
||||
|
@ -616,7 +616,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const fl
|
||||
|
||||
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
_phase = _phase + phase_inc * (avx_iters * 8);
|
||||
for (number = avx_iters * 8; number < num_points; number++)
|
||||
{
|
||||
@ -803,7 +803,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl
|
||||
|
||||
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
_phase = _phase + phase_inc * (avx_iters * 8);
|
||||
for (number = avx_iters * 8; number < num_points; number++)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user