Make SIMD instructions work on MS Windows

This commit is contained in:
Carles Fernandez 2020-11-20 16:44:06 +01:00
parent 94337c159f
commit 7e1f0176f4
No known key found for this signature in database
GPG Key ID: 4C583C52B0C3877D
9 changed files with 87 additions and 62 deletions

View File

@ -57,6 +57,8 @@ SPDX-FileCopyrightText: 2011-2020 Carles Fernandez-Prades <carles.fernandez@cttc
GNSS-SDR, the old method is still used in old development environments. No
extra dependency is needed. This change is transparent to the user, since
everything is managed by the CMake scripts.
- The `volk_gnsssdr` library can be built on Microsoft Windows and can execute
SIMD instructions on that OS.
- Fix building with `-DENABLE_CUDA=ON` for blocks implemented with CUDA.
### Improvements in Usability:

View File

@ -247,7 +247,10 @@ if(CMAKE_VERSION VERSION_GREATER 3.0 AND SUPPORTED_CPU_FEATURES_ARCH)
FORCE
)
set(USE_CPU_FEATURES ON)
set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
set(BUILD_SHARED_LIBS OFF)
add_subdirectory(cpu_features)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
endif()
# Python

View File

@ -139,6 +139,24 @@ $ volk_gnsssdr_profile
$ sudo ldconfig
```
### Building on Microsoft Windows
With Visual Studio 2019, in a Powershell run by administrator:
```
$ git clone https://github.com/gnss-sdr/gnss-sdr
$ cd gnss-sdr
$ git checkout next
$ cd build
$ cmake -G "Visual Studio 16 2019" ..\src\algorithms\libs\volk_gnsssdr_module\volk_gnsssdr
$ cd ..
$ cmake --build build --config Release
$ cd build
$ ctest -C Release
$ cd ..
$ cmake --install build
```
## References
If you use VOLK_GNSSSDR in your research and/or software, please cite the

View File

@ -68,7 +68,6 @@
<check name="mmx"></check>
<flag compiler="gnu">-mmmx</flag>
<flag compiler="clang">-mmmx</flag>
<flag compiler="msvc">/arch:SSE</flag>
<alignment>8</alignment>
</arch>
@ -84,7 +83,6 @@
<check name="sse"></check>
<flag compiler="gnu">-msse</flag>
<flag compiler="clang">-msse</flag>
<flag compiler="msvc">/arch:SSE</flag>
<environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
<include>xmmintrin.h</include>
<alignment>16</alignment>
@ -94,7 +92,6 @@
<check name="sse2"></check>
<flag compiler="gnu">-msse2</flag>
<flag compiler="clang">-msse2</flag>
<flag compiler="msvc">/arch:SSE2</flag>
<alignment>16</alignment>
</arch>

View File

@ -63,7 +63,6 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>
#include <math.h>
// #include <stdio.h>
#ifdef LV_HAVE_GENERIC
@ -74,7 +73,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
result[n_vec] = lv_cmake(0.0f, 0.0f);
}
for (n = 0; n < num_points; n++)
{
@ -115,7 +114,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
result[n_vec] = lv_cmake(0.0f, 0.0f);
}
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -158,6 +157,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
{
#ifndef WIN32
unsigned int number = 0;
int vec_ind = 0;
unsigned int i = 0;
@ -287,7 +287,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
result[vec_ind] = lv_cmake(0, 0);
result[vec_ind] = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i)
{
result[vec_ind] += dotProductVector[i];
@ -312,6 +312,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
}
*phase = _phase;
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
#endif
}
#endif /* LV_HAVE_AVX */
@ -322,6 +325,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
{
#ifndef WIN32
unsigned int number = 0;
int vec_ind = 0;
unsigned int i = 0;
@ -451,7 +455,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
result[vec_ind] = lv_cmake(0, 0);
result[vec_ind] = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i)
{
result[vec_ind] += dotProductVector[i];
@ -476,6 +480,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
}
*phase = _phase;
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
#endif
}

View File

@ -106,8 +106,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
#ifndef WIN32
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#endif
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
@ -136,8 +139,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
#ifndef WIN32
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#else
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
#endif
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);

View File

@ -74,7 +74,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
result[n_vec] = lv_cmake(0.0f, 0.0f);
}
for (n = 0; n < num_points; n++)
{
@ -115,7 +115,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
result[n_vec] = lv_cmake(0.0f, 0.0f);
}
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -158,7 +158,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
#include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2;
int n_vec;
@ -240,7 +240,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 2; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -270,7 +270,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
#include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2;
int n_vec;
@ -352,7 +352,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 2; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -382,7 +382,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4;
int n_vec;
@ -401,13 +401,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
acc[n_vec] = _mm256_setzero_ps();
result[n_vec] = lv_cmake(0, 0);
result[n_vec] = lv_cmake(0.0f, 0.0f);
}
// phase rotation registers
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
__attribute__((aligned(32))) lv_32fc_t four_phase_inc[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_inc[4];
const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@ -417,7 +418,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
four_phase_inc[3] = phase_inc4;
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
__attribute__((aligned(32))) lv_32fc_t four_phase_acc[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
four_phase_acc[0] = _phase;
four_phase_acc[1] = _phase * phase_inc;
four_phase_acc[2] = _phase * phase_inc2;
@ -472,7 +474,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -510,7 +512,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4;
int n_vec;
@ -529,7 +531,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
acc[n_vec] = _mm256_setzero_ps();
result[n_vec] = lv_cmake(0, 0);
result[n_vec] = lv_cmake(0.0f, 0.0f);
}
// phase rotation registers
@ -602,7 +604,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -655,7 +657,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
if (neon_iters > 0)
{
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
float32_t arg_phase0 = cargf(_phase);
float32_t arg_phase_inc = cargf(phase_inc);
float32_t phase_est;
@ -760,7 +762,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
dotProduct = lv_cmake(0.0f, 0.0f);
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];

View File

@ -145,6 +145,11 @@ macro(OVERRULE_ARCH arch reason)
list(REMOVE_ITEM available_archs ${arch})
endmacro()
macro(FORCE_ARCH arch reason)
message(STATUS "${reason}, Forced arch ${arch}")
list(APPEND available_archs ${arch})
endmacro()
########################################################################
# eliminate AVX on if not on x86, or if the compiler does not accept
# the xgetbv instruction, or {if not cross-compiling and the xgetbv
@ -264,13 +269,13 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
endif()
# MSVC 64 bit does not have MMX, overrule it
if(${SIZEOF_CPU} EQUAL 64 AND MSVC)
overrule_arch(mmx "No MMX for Win64")
if(MSVC_VERSION GREATER 1700)
overrule_arch(sse "No SSE for Win64 Visual Studio 2013")
if(MSVC)
if(${SIZEOF_CPU} EQUAL 64)
overrule_arch(mmx "No MMX for Win64")
endif()
force_arch(sse "Built-in for MSVC > 2013")
force_arch(sse2 "Built-in for MSVC > 2013")
endif()
endif()
########################################################################

View File

@ -15,25 +15,10 @@
#include <string.h>
// clang-format on
#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
#define __popcnt __builtin_popcount
#else
inline unsigned __popcnt(unsigned num)
{
unsigned pop = 0;
while (num)
{
if (num & 0x1) pop++;
num >>= 1;
}
return pop;
}
#endif
int volk_gnsssdr_get_index(
const char *impl_names[], //list of implementations by name
const size_t n_impls, //number of implementations available
const char *impl_name //the implementation name to find
const char *impl_names[], // list of implementations by name
const size_t n_impls, // number of implementations available
const char *impl_name // the implementation name to find
)
{
unsigned int i;
@ -44,20 +29,20 @@ int volk_gnsssdr_get_index(
return i;
}
}
//TODO return -1;
//something terrible should happen here
// TODO return -1;
// something terrible should happen here
fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
}
int volk_gnsssdr_rank_archs(
const char *kern_name, //name of the kernel to rank
const char *impl_names[], //list of implementations by name
const int *impl_deps, //requirement mask per implementation
const bool *alignment, //alignment status of each implementation
size_t n_impls, //number of implementations available
const bool align //if false, filter aligned implementations
const char *kern_name, // name of the kernel to rank
const char *impl_names[], // list of implementations by name
const int *impl_deps, // requirement mask per implementation
const bool *alignment, // alignment status of each implementation
size_t n_impls, // number of implementations available
const bool align // if false, filter aligned implementations
)
{
size_t i;
@ -78,7 +63,7 @@ int volk_gnsssdr_rank_archs(
return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
}
//now look for the function name in the prefs list
// now look for the function name in the prefs list
for (i = 0; i < n_arch_prefs; i++)
{
if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it
@ -88,14 +73,14 @@ int volk_gnsssdr_rank_archs(
}
}
//return the best index with the largest deps
// return the best index with the largest deps
size_t best_index_a = 0;
size_t best_index_u = 0;
int best_value_a = -1;
int best_value_u = -1;
for (i = 0; i < n_impls; i++)
{
const signed val = __popcnt(impl_deps[i]);
const signed val = impl_deps[i];
if (alignment[i] && val > best_value_a)
{
best_index_a = i;
@ -108,9 +93,9 @@ int volk_gnsssdr_rank_archs(
}
}
//when align and we found a best aligned, use it
// when align and we found a best aligned, use it
if (align && best_value_a != -1) return best_index_a;
//otherwise return the best unaligned
// otherwise return the best unaligned
return best_index_u;
}