Make SIMD instructions work on MS Windows

This commit is contained in:
Carles Fernandez 2020-11-20 16:44:06 +01:00
parent 94337c159f
commit 7e1f0176f4
No known key found for this signature in database
GPG Key ID: 4C583C52B0C3877D
9 changed files with 87 additions and 62 deletions

View File

@ -57,6 +57,8 @@ SPDX-FileCopyrightText: 2011-2020 Carles Fernandez-Prades <carles.fernandez@cttc
GNSS-SDR, the old method is still used in old development environments. No GNSS-SDR, the old method is still used in old development environments. No
extra dependency is needed. This change is transparent to the user, since extra dependency is needed. This change is transparent to the user, since
everything is managed by the CMake scripts. everything is managed by the CMake scripts.
- The `volk_gnsssdr` library can be built on Microsoft Windows and can execute
SIMD instructions on that OS.
- Fix building with `-DENABLE_CUDA=ON` for blocks implemented with CUDA. - Fix building with `-DENABLE_CUDA=ON` for blocks implemented with CUDA.
### Improvements in Usability: ### Improvements in Usability:

View File

@ -247,7 +247,10 @@ if(CMAKE_VERSION VERSION_GREATER 3.0 AND SUPPORTED_CPU_FEATURES_ARCH)
FORCE FORCE
) )
set(USE_CPU_FEATURES ON) set(USE_CPU_FEATURES ON)
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
set(BUILD_SHARED_LIBS OFF)
add_subdirectory(cpu_features) add_subdirectory(cpu_features)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
endif() endif()
# Python # Python

View File

@ -139,6 +139,24 @@ $ volk_gnsssdr_profile
$ sudo ldconfig $ sudo ldconfig
``` ```
### Building on Microsoft Windows
With Visual Studio 2019, from a PowerShell session run as administrator:
```
$ git clone https://github.com/gnss-sdr/gnss-sdr
$ cd gnss-sdr
$ git checkout next
$ cd build
$ cmake -G "Visual Studio 16 2019" ..\src\algorithms\libs\volk_gnsssdr_module\volk_gnsssdr
$ cd ..
$ cmake --build build --config Release
$ cd build
$ ctest -C Release
$ cd ..
$ cmake --install build
```
## References ## References
If you use VOLK_GNSSSDR in your research and/or software, please cite the If you use VOLK_GNSSSDR in your research and/or software, please cite the

View File

@ -68,7 +68,6 @@
<check name="mmx"></check> <check name="mmx"></check>
<flag compiler="gnu">-mmmx</flag> <flag compiler="gnu">-mmmx</flag>
<flag compiler="clang">-mmmx</flag> <flag compiler="clang">-mmmx</flag>
<flag compiler="msvc">/arch:SSE</flag>
<alignment>8</alignment> <alignment>8</alignment>
</arch> </arch>
@ -84,7 +83,6 @@
<check name="sse"></check> <check name="sse"></check>
<flag compiler="gnu">-msse</flag> <flag compiler="gnu">-msse</flag>
<flag compiler="clang">-msse</flag> <flag compiler="clang">-msse</flag>
<flag compiler="msvc">/arch:SSE</flag>
<environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment> <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
<include>xmmintrin.h</include> <include>xmmintrin.h</include>
<alignment>16</alignment> <alignment>16</alignment>
@ -94,7 +92,6 @@
<check name="sse2"></check> <check name="sse2"></check>
<flag compiler="gnu">-msse2</flag> <flag compiler="gnu">-msse2</flag>
<flag compiler="clang">-msse2</flag> <flag compiler="clang">-msse2</flag>
<flag compiler="msvc">/arch:SSE2</flag>
<alignment>16</alignment> <alignment>16</alignment>
</arch> </arch>

View File

@ -63,7 +63,6 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h> #include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <volk_gnsssdr/volk_gnsssdr_malloc.h> #include <volk_gnsssdr/volk_gnsssdr_malloc.h>
#include <math.h> #include <math.h>
// #include <stdio.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -74,7 +73,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
unsigned int n; unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0.0f, 0.0f);
} }
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
@ -115,7 +114,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
unsigned int j; unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0.0f, 0.0f);
} }
for (n = 0; n < num_points / ROTATOR_RELOAD; n++) for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -158,6 +157,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
{ {
#ifndef WIN32
unsigned int number = 0; unsigned int number = 0;
int vec_ind = 0; int vec_ind = 0;
unsigned int i = 0; unsigned int i = 0;
@ -287,7 +287,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
result[vec_ind] = lv_cmake(0, 0); result[vec_ind] = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
result[vec_ind] += dotProductVector[i]; result[vec_ind] += dotProductVector[i];
@ -312,6 +312,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
} }
*phase = _phase; *phase = _phase;
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
#endif
} }
#endif /* LV_HAVE_AVX */ #endif /* LV_HAVE_AVX */
@ -322,6 +325,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
{ {
#ifndef WIN32
unsigned int number = 0; unsigned int number = 0;
int vec_ind = 0; int vec_ind = 0;
unsigned int i = 0; unsigned int i = 0;
@ -451,7 +455,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
result[vec_ind] = lv_cmake(0, 0); result[vec_ind] = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
result[vec_ind] += dotProductVector[i]; result[vec_ind] += dotProductVector[i];
@ -476,6 +480,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
} }
*phase = _phase; *phase = _phase;
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
#endif
} }

View File

@ -106,8 +106,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
} }
#ifndef WIN32
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#endif
for (n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
@ -136,8 +139,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
} }
#ifndef WIN32
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#endif
for (n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);

View File

@ -74,7 +74,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
unsigned int n; unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0.0f, 0.0f);
} }
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
@ -115,7 +115,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
unsigned int j; unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0.0f, 0.0f);
} }
for (n = 0; n < num_points / ROTATOR_RELOAD; n++) for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -158,7 +158,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
#include <pmmintrin.h> #include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2; const unsigned int sse_iters = num_points / 2;
int n_vec; int n_vec;
@ -240,7 +240,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@ -270,7 +270,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
#include <pmmintrin.h> #include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2; const unsigned int sse_iters = num_points / 2;
int n_vec; int n_vec;
@ -352,7 +352,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@ -382,7 +382,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4; const unsigned int avx_iters = num_points / 4;
int n_vec; int n_vec;
@ -401,13 +401,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
acc[n_vec] = _mm256_setzero_ps(); acc[n_vec] = _mm256_setzero_ps();
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0.0f, 0.0f);
} }
// phase rotation registers // phase rotation registers
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
__attribute__((aligned(32))) lv_32fc_t four_phase_inc[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_inc[4];
const lv_32fc_t phase_inc2 = phase_inc * phase_inc; const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@ -417,7 +418,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
four_phase_inc[3] = phase_inc4; four_phase_inc[3] = phase_inc4;
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
__attribute__((aligned(32))) lv_32fc_t four_phase_acc[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
four_phase_acc[0] = _phase; four_phase_acc[0] = _phase;
four_phase_acc[1] = _phase * phase_inc; four_phase_acc[1] = _phase * phase_inc;
four_phase_acc[2] = _phase * phase_inc2; four_phase_acc[2] = _phase * phase_inc2;
@ -472,7 +474,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@ -510,7 +512,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4; const unsigned int avx_iters = num_points / 4;
int n_vec; int n_vec;
@ -529,7 +531,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
acc[n_vec] = _mm256_setzero_ps(); acc[n_vec] = _mm256_setzero_ps();
result[n_vec] = lv_cmake(0, 0); result[n_vec] = lv_cmake(0.0f, 0.0f);
} }
// phase rotation registers // phase rotation registers
@ -602,7 +604,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@ -655,7 +657,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
if (neon_iters > 0) if (neon_iters > 0)
{ {
lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
float32_t arg_phase0 = cargf(_phase); float32_t arg_phase0 = cargf(_phase);
float32_t arg_phase_inc = cargf(phase_inc); float32_t arg_phase_inc = cargf(phase_inc);
float32_t phase_est; float32_t phase_est;
@ -760,7 +762,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];

View File

@ -145,6 +145,11 @@ macro(OVERRULE_ARCH arch reason)
list(REMOVE_ITEM available_archs ${arch}) list(REMOVE_ITEM available_archs ${arch})
endmacro() endmacro()
# FORCE_ARCH(<arch> <reason>)
#
# Adds <arch> to the available_archs list and logs <reason> as a STATUS
# message. Counterpart of OVERRULE_ARCH, which removes an arch from the list.
#
# This stays a macro (not a function) on purpose: it has to modify
# available_archs in the caller's scope.
#
# Example:
#   force_arch(sse2 "Built-in for MSVC > 2013")
macro(FORCE_ARCH arch reason)
    message(STATUS "${reason}, Forced arch ${arch}")
    list(APPEND available_archs ${arch})
    # The arch may already be listed; keep a single entry so that forcing an
    # arch that is already available does not leave a duplicate in the list.
    list(REMOVE_DUPLICATES available_archs)
endmacro()
######################################################################## ########################################################################
# eliminate AVX on if not on x86, or if the compiler does not accept # eliminate AVX on if not on x86, or if the compiler does not accept
# the xgetbv instruction, or {if not cross-compiling and the xgetbv # the xgetbv instruction, or {if not cross-compiling and the xgetbv
@ -264,13 +269,13 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
endif() endif()
# MSVC 64 bit does not have MMX, overrule it # MSVC 64 bit does not have MMX, overrule it
if(${SIZEOF_CPU} EQUAL 64 AND MSVC) if(MSVC)
overrule_arch(mmx "No MMX for Win64") if(${SIZEOF_CPU} EQUAL 64)
if(MSVC_VERSION GREATER 1700) overrule_arch(mmx "No MMX for Win64")
overrule_arch(sse "No SSE for Win64 Visual Studio 2013")
endif() endif()
force_arch(sse "Built-in for MSVC > 2013")
force_arch(sse2 "Built-in for MSVC > 2013")
endif() endif()
endif() endif()
######################################################################## ########################################################################

View File

@ -15,25 +15,10 @@
#include <string.h> #include <string.h>
// clang-format on // clang-format on
// Provide __popcnt(): returns the number of bits set to 1 in an unsigned value.
// With GCC 3.4 or newer, __popcnt is mapped to the compiler builtin.
#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
#define __popcnt __builtin_popcount
#else
// Fallback implementation: test the lowest bit, then shift right,
// repeating until no set bits remain in num.
inline unsigned __popcnt(unsigned num)
{
unsigned pop = 0;
while (num)
{
// Count the lowest bit if it is set.
if (num & 0x1) pop++;
num >>= 1;
}
return pop;
}
#endif
int volk_gnsssdr_get_index( int volk_gnsssdr_get_index(
const char *impl_names[], //list of implementations by name const char *impl_names[], // list of implementations by name
const size_t n_impls, //number of implementations available const size_t n_impls, // number of implementations available
const char *impl_name //the implementation name to find const char *impl_name // the implementation name to find
) )
{ {
unsigned int i; unsigned int i;
@ -44,20 +29,20 @@ int volk_gnsssdr_get_index(
return i; return i;
} }
} }
//TODO return -1; // TODO return -1;
//something terrible should happen here // something terrible should happen here
fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n"); fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
} }
int volk_gnsssdr_rank_archs( int volk_gnsssdr_rank_archs(
const char *kern_name, //name of the kernel to rank const char *kern_name, // name of the kernel to rank
const char *impl_names[], //list of implementations by name const char *impl_names[], // list of implementations by name
const int *impl_deps, //requirement mask per implementation const int *impl_deps, // requirement mask per implementation
const bool *alignment, //alignment status of each implementation const bool *alignment, // alignment status of each implementation
size_t n_impls, //number of implementations available size_t n_impls, // number of implementations available
const bool align //if false, filter aligned implementations const bool align // if false, filter aligned implementations
) )
{ {
size_t i; size_t i;
@ -78,7 +63,7 @@ int volk_gnsssdr_rank_archs(
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
} }
//now look for the function name in the prefs list // now look for the function name in the prefs list
for (i = 0; i < n_arch_prefs; i++) for (i = 0; i < n_arch_prefs; i++)
{ {
if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it
@ -88,14 +73,14 @@ int volk_gnsssdr_rank_archs(
} }
} }
//return the best index with the largest deps // return the best index with the largest deps
size_t best_index_a = 0; size_t best_index_a = 0;
size_t best_index_u = 0; size_t best_index_u = 0;
int best_value_a = -1; int best_value_a = -1;
int best_value_u = -1; int best_value_u = -1;
for (i = 0; i < n_impls; i++) for (i = 0; i < n_impls; i++)
{ {
const signed val = __popcnt(impl_deps[i]); const signed val = impl_deps[i];
if (alignment[i] && val > best_value_a) if (alignment[i] && val > best_value_a)
{ {
best_index_a = i; best_index_a = i;
@ -108,9 +93,9 @@ int volk_gnsssdr_rank_archs(
} }
} }
//when align and we found a best aligned, use it // when align and we found a best aligned, use it
if (align && best_value_a != -1) return best_index_a; if (align && best_value_a != -1) return best_index_a;
//otherwise return the best unaligned // otherwise return the best unaligned
return best_index_u; return best_index_u;
} }