mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-02-21 05:20:37 +00:00
Add _mm256_zeroupper() at the end of AVX and AVX2 protokernels
This avoids the penalty incurred when execution transitions from 256-bit x86 AVX instructions to x86 SSE instructions.
This commit is contained in:
parent
b1d99d58ec
commit
78372ba2e9
@ -290,6 +290,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
||||
result = _mm256_or_si256(realcacc, imagcacc);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (i = 0; i < 8; ++i)
|
||||
{
|
||||
@ -365,6 +366,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
||||
result = _mm256_or_si256(realcacc, imagcacc);
|
||||
|
||||
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (i = 0; i < 8; ++i)
|
||||
{
|
||||
|
@ -369,6 +369,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||
{
|
||||
@ -461,6 +462,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||
{
|
||||
|
@ -225,7 +225,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
|
||||
_in_b += 8;
|
||||
_out += 8;
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
number = avx2_points * 8;
|
||||
for(;number < num_points; number++)
|
||||
{
|
||||
@ -279,7 +279,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
|
||||
_in_b += 8;
|
||||
_out += 8;
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
number = avx2_points * 8;
|
||||
for(;number < num_points; number++)
|
||||
{
|
||||
|
@ -930,7 +930,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
||||
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
|
||||
_mm256_zeroupper();
|
||||
|
||||
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||
(*phase) = two_phase_acc[0];
|
||||
@ -1241,7 +1241,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
||||
}
|
||||
_out[n_vec] = dotProduct;
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
volk_gnsssdr_free(realcacc);
|
||||
volk_gnsssdr_free(imagcacc);
|
||||
|
||||
|
@ -37,7 +37,8 @@
|
||||
*
|
||||
* \b Overview
|
||||
*
|
||||
* Computes the sine and cosine of a vector of floats, providing the output in a complex vector (cosine, sine)
|
||||
* VOLK_GNSSSDR kernel that computes the sine and cosine of a vector
|
||||
* of floats, providing the output in a complex vector (cosine, sine)
|
||||
*
|
||||
* <b>Dispatcher Prototype</b>
|
||||
* \code
|
||||
@ -133,12 +134,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
|
||||
cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
|
||||
|
||||
cplxValue = _mm_unpacklo_ps(cosine, sine);
|
||||
|
||||
_mm_storeu_ps((float*)bPtr, cplxValue);
|
||||
bPtr += 2;
|
||||
|
||||
cplxValue = _mm_unpackhi_ps(cosine, sine);
|
||||
_mm_storeu_ps((float*)bPtr, cplxValue);
|
||||
bPtr += 2;
|
||||
|
||||
aPtr += 4;
|
||||
}
|
||||
|
||||
@ -226,12 +228,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
|
||||
cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
|
||||
|
||||
cplxValue = _mm_unpacklo_ps(cosine, sine);
|
||||
|
||||
_mm_store_ps((float*)bPtr, cplxValue);
|
||||
bPtr += 2;
|
||||
|
||||
cplxValue = _mm_unpackhi_ps(cosine, sine);
|
||||
_mm_store_ps((float*)bPtr, cplxValue);
|
||||
bPtr += 2;
|
||||
|
||||
aPtr += 4;
|
||||
}
|
||||
|
||||
@ -587,7 +590,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
|
||||
_in = *in++;
|
||||
d = (int32_t)floor(_in / TWO_PI + 0.5);
|
||||
_in -= d * TWO_PI;
|
||||
x = (int32_t) ((float) _in * TWO_TO_THE_31_DIV_PI);
|
||||
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
|
||||
|
||||
ux = x;
|
||||
sin_index = ux >> diffbits;
|
||||
|
@ -479,6 +479,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
||||
|
||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||
_phase = four_phase_acc[0];
|
||||
_mm256_zeroupper();
|
||||
|
||||
for(unsigned int n = avx_iters * 4; n < num_points; n++)
|
||||
{
|
||||
@ -602,6 +603,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
||||
|
||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||
_phase = four_phase_acc[0];
|
||||
_mm256_zeroupper();
|
||||
|
||||
for(unsigned int n = avx_iters * 4; n < num_points; n++)
|
||||
{
|
||||
|
@ -588,7 +588,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
||||
|
||||
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
_phase = _phase + phase_inc * (avx_iters * 8);
|
||||
for(number = avx_iters * 8; number < num_points; number++)
|
||||
{
|
||||
@ -756,7 +756,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
||||
|
||||
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
||||
}
|
||||
|
||||
_mm256_zeroupper();
|
||||
_phase = _phase + phase_inc * (avx_iters * 8);
|
||||
for(number = avx_iters * 8; number < num_points; number++)
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user