1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-09-27 22:58:24 +00:00

Add _mm256_zeroupper() at the end of AVX and AVX2 protokernels

This avoids the performance penalty incurred on state transitions from
256-bit x86 AVX instructions to legacy x86 SSE instructions.
This commit is contained in:
Carles Fernandez 2016-03-28 11:58:01 +02:00
parent b1d99d58ec
commit 78372ba2e9
7 changed files with 19 additions and 10 deletions

View File

@ -290,6 +290,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper();
for (i = 0; i < 8; ++i)
{
@ -365,6 +366,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper();
for (i = 0; i < 8; ++i)
{

View File

@ -369,6 +369,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
}
_mm256_zeroupper();
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
@ -461,6 +462,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
}
_mm256_zeroupper();
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
{

View File

@ -225,7 +225,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
_in_b += 8;
_out += 8;
}
_mm256_zeroupper();
number = avx2_points * 8;
for(;number < num_points; number++)
{
@ -279,7 +279,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
_in_b += 8;
_out += 8;
}
_mm256_zeroupper();
number = avx2_points * 8;
for(;number < num_points; number++)
{

View File

@ -930,7 +930,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);
_mm256_zeroupper();
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
@ -1241,7 +1241,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
}
_out[n_vec] = dotProduct;
}
_mm256_zeroupper();
volk_gnsssdr_free(realcacc);
volk_gnsssdr_free(imagcacc);

View File

@ -37,7 +37,8 @@
*
* \b Overview
*
* Computes the sine and cosine of a vector of floats, providing the output in a complex vector (cosine, sine)
* VOLK_GNSSSDR kernel that computes the sine and cosine of a vector
* of floats, providing the output in a complex vector (cosine, sine)
*
* <b>Dispatcher Prototype</b>
* \code
@ -133,12 +134,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
cplxValue = _mm_unpacklo_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, cplxValue);
bPtr += 2;
cplxValue = _mm_unpackhi_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, cplxValue);
bPtr += 2;
aPtr += 4;
}
@ -226,12 +228,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
cplxValue = _mm_unpacklo_ps(cosine, sine);
_mm_store_ps((float*)bPtr, cplxValue);
bPtr += 2;
cplxValue = _mm_unpackhi_ps(cosine, sine);
_mm_store_ps((float*)bPtr, cplxValue);
bPtr += 2;
aPtr += 4;
}
@ -587,7 +590,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
_in = *in++;
d = (int32_t)floor(_in / TWO_PI + 0.5);
_in -= d * TWO_PI;
x = (int32_t) ((float) _in * TWO_TO_THE_31_DIV_PI);
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
ux = x;
sin_index = ux >> diffbits;

View File

@ -479,6 +479,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
_mm256_zeroupper();
for(unsigned int n = avx_iters * 4; n < num_points; n++)
{
@ -602,6 +603,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
_mm256_zeroupper();
for(unsigned int n = avx_iters * 4; n < num_points; n++)
{

View File

@ -588,7 +588,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
_mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++)
{
@ -756,7 +756,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
_mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++)
{