From 7e1f0176f4fe9ff9d5b668dde6cb584123d0e501 Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Fri, 20 Nov 2020 16:44:06 +0100 Subject: [PATCH] Make SIMD instructions work on MS Windows --- docs/changelog.md | 2 + .../volk_gnsssdr/CMakeLists.txt | 3 ++ .../volk_gnsssdr/README.md | 18 +++++++ .../volk_gnsssdr/gen/archs.xml | 3 -- ...nsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h | 17 +++++-- ...dr_32fc_32f_rotator_dotprodxnpuppet_32fc.h | 10 +++- ...gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h | 34 +++++++------- .../volk_gnsssdr/lib/CMakeLists.txt | 15 ++++-- .../lib/volk_gnsssdr_rank_archs.c | 47 +++++++------------ 9 files changed, 87 insertions(+), 62 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 2defafb3d..5196141a5 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -57,6 +57,8 @@ SPDX-FileCopyrightText: 2011-2020 Carles Fernandez-Prades -mmmx -mmmx - /arch:SSE 8 @@ -84,7 +83,6 @@ -msse -msse - /arch:SSE _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); xmmintrin.h 16 @@ -94,7 +92,6 @@ -msse2 -msse2 - /arch:SSE2 16 diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h index bb149a55c..2888cc679 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h @@ -63,7 +63,6 @@ #include #include #include -// #include #ifdef LV_HAVE_GENERIC @@ -74,7 +73,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0, 0); + result[n_vec] = lv_cmake(0.0f, 0.0f); } for (n = 0; n < num_points; n++) { @@ -115,7 +114,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload unsigned int j; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0, 0); + result[n_vec] = lv_cmake(0.0f, 0.0f); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) @@ -158,6 +157,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload #include static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) { +#ifndef WIN32 unsigned int number = 0; int vec_ind = 0; unsigned int i = 0; @@ -287,7 +287,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector - result[vec_ind] = lv_cmake(0, 0); + result[vec_ind] = lv_cmake(0.0f, 0.0f); for (i = 0; i < 4; ++i) { result[vec_ind] += dotProductVector[i]; @@ -312,6 +312,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ } *phase = _phase; +#else + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points); +#endif } #endif /* LV_HAVE_AVX */ @@ -322,6 +325,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ #include static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) { +#ifndef WIN32 unsigned int number = 0; int vec_ind = 0; unsigned int i = 0; @@ -451,7 +455,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector - result[vec_ind] = lv_cmake(0, 0); + result[vec_ind] = lv_cmake(0.0f, 0.0f); for (i = 0; i < 4; ++i) { result[vec_ind] += dotProductVector[i]; @@ -476,6 +480,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ } *phase = _phase; +#else + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points); +#endif } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h index 85eb7bf5c..a73bac9cd 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h @@ -106,8 +106,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3 in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } +#ifndef WIN32 volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - +#else + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); +#endif for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); @@ -136,8 +139,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3 in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } +#ifndef WIN32 volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - +#else + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); +#endif for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h index 9b9ced9b1..45f0b7452 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h @@ -74,7 +74,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0, 0); + result[n_vec] = lv_cmake(0.0f, 0.0f); } for (n = 0; n < num_points; n++) { @@ -115,7 +115,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( unsigned int j; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0, 0); + result[n_vec] = lv_cmake(0.0f, 0.0f); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) @@ -158,7 +158,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0, 0); + lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f); lv_32fc_t tmp32_1, tmp32_2; const unsigned int sse_iters = num_points / 2; int n_vec; @@ -240,7 +240,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0, 0); + dotProduct = lv_cmake(0.0f, 0.0f); for (i = 0; i < 2; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -270,7 +270,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0, 0); + lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f); lv_32fc_t tmp32_1, tmp32_2; const unsigned int sse_iters = num_points / 2; int n_vec; @@ -352,7 +352,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0, 0); + dotProduct = lv_cmake(0.0f, 0.0f); for (i = 0; i < 2; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -382,7 +382,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0, 0); + lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f); lv_32fc_t tmp32_1, tmp32_2; const unsigned int avx_iters = num_points / 4; int n_vec; @@ -401,13 +401,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { acc[n_vec] = _mm256_setzero_ps(); - result[n_vec] = lv_cmake(0, 0); + result[n_vec] = lv_cmake(0.0f, 0.0f); } // phase rotation registers __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; - __attribute__((aligned(32))) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_inc[4]; const lv_32fc_t phase_inc2 = phase_inc * phase_inc; const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; @@ -417,7 +418,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t four_phase_inc[3] = phase_inc4; const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); - __attribute__((aligned(32))) lv_32fc_t four_phase_acc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_acc[4]; four_phase_acc[0] = _phase; four_phase_acc[1] = _phase * phase_inc; four_phase_acc[2] = _phase * phase_inc2; @@ -472,7 +474,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0, 0); + dotProduct = lv_cmake(0.0f, 0.0f); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -510,7 +512,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0, 0); + lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f); lv_32fc_t tmp32_1, tmp32_2; const unsigned int avx_iters = num_points / 4; int n_vec; @@ -529,7 +531,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { acc[n_vec] = _mm256_setzero_ps(); - result[n_vec] = lv_cmake(0, 0); + result[n_vec] = lv_cmake(0.0f, 0.0f); } // phase rotation registers @@ -602,7 +604,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0, 0); + dotProduct = lv_cmake(0.0f, 0.0f); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -655,7 +657,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* if (neon_iters > 0) { - lv_32fc_t dotProduct = lv_cmake(0, 0); + lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f); float32_t arg_phase0 = cargf(_phase); float32_t arg_phase_inc = cargf(phase_inc); float32_t phase_est; @@ -760,7 +762,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0, 0); + dotProduct = lv_cmake(0.0f, 0.0f); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt index 68afbf718..2fe6e04ab 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt @@ -145,6 +145,11 @@ macro(OVERRULE_ARCH arch reason) list(REMOVE_ITEM available_archs ${arch}) endmacro() +macro(FORCE_ARCH arch reason) + message(STATUS "${reason}, Forced arch ${arch}") + list(APPEND available_archs ${arch}) +endmacro() + ######################################################################## # eliminate AVX on if not on x86, or if the compiler does not accept # the xgetbv instruction, or {if not cross-compiling and the xgetbv @@ -264,13 +269,13 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) endif() # MSVC 64 bit does not have MMX, overrule it - if(${SIZEOF_CPU} EQUAL 64 AND MSVC) - overrule_arch(mmx "No MMX for Win64") - if(MSVC_VERSION GREATER 1700) - overrule_arch(sse "No SSE for Win64 Visual Studio 2013") + if(MSVC) + if(${SIZEOF_CPU} EQUAL 64) + overrule_arch(mmx "No MMX for Win64") endif() + force_arch(sse "Built-in for MSVC > 2013") + force_arch(sse2 "Built-in for MSVC > 2013") endif() - endif() ######################################################################## diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c index db5b931d1..dca205524 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c @@ -15,25 +15,10 @@ #include // clang-format on -#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4 -#define __popcnt __builtin_popcount -#else -inline unsigned __popcnt(unsigned num) -{ - unsigned pop = 0; - while (num) - { - if (num & 0x1) pop++; - num >>= 1; - } - return pop; -} -#endif - int volk_gnsssdr_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find + const char *impl_names[], // list of implementations by name + const size_t n_impls, // number of implementations available + const char *impl_name // the implementation name to find ) { unsigned int i; @@ -44,20 +29,20 @@ int volk_gnsssdr_get_index( return i; } } - //TODO return -1; - //something terrible should happen here + // TODO return -1; + // something terrible should happen here fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n"); return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now } int volk_gnsssdr_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int *impl_deps, //requirement mask per implementation - const bool *alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations + const char *kern_name, // name of the kernel to rank + const char *impl_names[], // list of implementations by name + const int *impl_deps, // requirement mask per implementation + const bool *alignment, // alignment status of each implementation + size_t n_impls, // number of implementations available + const bool align // if false, filter aligned implementations ) { size_t i; @@ -78,7 +63,7 @@ int volk_gnsssdr_rank_archs( return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); } - //now look for the function name in the prefs list + // now look for the function name in the prefs list for (i = 0; i < n_arch_prefs; i++) { if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it @@ -88,14 +73,14 @@ int volk_gnsssdr_rank_archs( } } - //return the best index with the largest deps + // return the best index with the largest deps size_t best_index_a = 0; size_t best_index_u = 0; int best_value_a = -1; int best_value_u = -1; for (i = 0; i < n_impls; i++) { - const signed val = __popcnt(impl_deps[i]); + const signed val = impl_deps[i]; if (alignment[i] && val > best_value_a) { best_index_a = i; @@ -108,9 +93,9 @@ int volk_gnsssdr_rank_archs( } } - //when align and we found a best aligned, use it + // when align and we found a best aligned, use it if (align && best_value_a != -1) return best_index_a; - //otherwise return the best unaligned + // otherwise return the best unaligned return best_index_u; }