Make SIMD instructions work on MS Windows

2020-11-20 16:44:06 +01:00 · 2020-11-20 16:44:06 +01:00 · 7e1f0176f4
parent 94337c159f
commit 7e1f0176f4
9 changed files with 87 additions and 62 deletions
--- a/docs/changelog.md
+++ b/docs/changelog.md
@ -57,6 +57,8 @@ SPDX-FileCopyrightText: 2011-2020 Carles Fernandez-Prades <carles.fernandez@cttc
  GNSS-SDR, the old method is still used in old development environments. No
  extra dependency is needed. This change is transparent to the user, since
  everything is managed by the CMake scripts.
 - The `volk_gnsssdr` library can be built on Microsoft Windows and can execute
  SIMD instructions on that OS.
 - Fix building with `-DENABLE_CUDA=ON` for blocks implemented with CUDA.
 ### Improvements in Usability:
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/CMakeLists.txt
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/CMakeLists.txt
@ -247,7 +247,10 @@ if(CMAKE_VERSION VERSION_GREATER 3.0 AND SUPPORTED_CPU_FEATURES_ARCH)
        FORCE
    )
    set(USE_CPU_FEATURES ON)
    set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
    set(BUILD_SHARED_LIBS OFF)
    add_subdirectory(cpu_features)
    set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
 endif()
 # Python
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/README.md
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/README.md
@ -139,6 +139,24 @@ $ volk_gnsssdr_profile
 $ sudo ldconfig
 ```
 ### Building on Microsoft Windows
 With Visual Studio 2019, from a PowerShell terminal run as administrator:
 ```
 $ git clone https://github.com/gnss-sdr/gnss-sdr
 $ cd gnss-sdr
 $ git checkout next
 $ cd build
 $ cmake -G "Visual Studio 16 2019" ..\src\algorithms\libs\volk_gnsssdr_module\volk_gnsssdr
 $ cd ..
 $ cmake --build build --config Release
 $ cd build
 $ ctest -C Release
 $ cd ..
 $ cmake --install build
 ```
 ## References
 If you use VOLK_GNSSSDR in your research and/or software, please cite the
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/gen/archs.xml
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/gen/archs.xml
@ -68,7 +68,6 @@
  <check name="mmx"></check>
  <flag compiler="gnu">-mmmx</flag>
  <flag compiler="clang">-mmmx</flag>
  <flag compiler="msvc">/arch:SSE</flag>
  <alignment>8</alignment>
 </arch>
@ -84,7 +83,6 @@
  <check name="sse"></check>
  <flag compiler="gnu">-msse</flag>
  <flag compiler="clang">-msse</flag>
  <flag compiler="msvc">/arch:SSE</flag>
  <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
  <include>xmmintrin.h</include>
  <alignment>16</alignment>
@ -94,7 +92,6 @@
  <check name="sse2"></check>
  <flag compiler="gnu">-msse2</flag>
  <flag compiler="clang">-msse2</flag>
  <flag compiler="msvc">/arch:SSE2</flag>
  <alignment>16</alignment>
 </arch>
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
@ -63,7 +63,6 @@
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <volk_gnsssdr/volk_gnsssdr_malloc.h>
 #include <math.h>
 // #include <stdio.h>
 #ifdef LV_HAVE_GENERIC
@ -74,7 +73,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    for (n = 0; n < num_points; n++)
        {
@ -115,7 +114,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
    unsigned int j;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -158,6 +157,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
 {
 #ifndef WIN32
    unsigned int number = 0;
    int vec_ind = 0;
    unsigned int i = 0;
@ -287,7 +287,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
            _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]);  // Store the results back into the dot product vector
-            result[vec_ind] = lv_cmake(0, 0);
+            result[vec_ind] = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    result[vec_ind] += dotProductVector[i];
@ -312,6 +312,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
        }
    *phase = _phase;
 #else
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
 #endif
 }
 #endif /* LV_HAVE_AVX */
@ -322,6 +325,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
 {
 #ifndef WIN32
    unsigned int number = 0;
    int vec_ind = 0;
    unsigned int i = 0;
@ -451,7 +455,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
            _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]);  // Store the results back into the dot product vector
-            result[vec_ind] = lv_cmake(0, 0);
+            result[vec_ind] = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    result[vec_ind] += dotProductVector[i];
@ -476,6 +480,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
        }
    *phase = _phase;
 #else
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
 #endif
 }
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
@ -106,8 +106,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
        }
 #ifndef WIN32
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
-
+#else
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
 #endif
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
@ -136,8 +139,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
        }
 #ifndef WIN32
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
-
+#else
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
 #endif
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
@ -74,7 +74,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    for (n = 0; n < num_points; n++)
        {
@ -115,7 +115,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
    unsigned int j;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -158,7 +158,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
 #include <pmmintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int sse_iters = num_points / 2;
    int n_vec;
@ -240,7 +240,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 2; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@ -270,7 +270,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
 #include <pmmintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int sse_iters = num_points / 2;
    int n_vec;
@ -352,7 +352,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 2; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@ -382,7 +382,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int avx_iters = num_points / 4;
    int n_vec;
@ -401,13 +401,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            acc[n_vec] = _mm256_setzero_ps();
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    // phase rotation registers
    __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
-    __attribute__((aligned(32))) lv_32fc_t four_phase_inc[4];
+    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t four_phase_inc[4];
    const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
    const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
    const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@ -417,7 +418,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
    four_phase_inc[3] = phase_inc4;
    const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
-    __attribute__((aligned(32))) lv_32fc_t four_phase_acc[4];
+    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t four_phase_acc[4];
    four_phase_acc[0] = _phase;
    four_phase_acc[1] = _phase * phase_inc;
    four_phase_acc[2] = _phase * phase_inc2;
@ -472,7 +474,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm256_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@ -510,7 +512,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int avx_iters = num_points / 4;
    int n_vec;
@ -529,7 +531,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            acc[n_vec] = _mm256_setzero_ps();
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    // phase rotation registers
@ -602,7 +604,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm256_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@ -655,7 +657,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
    if (neon_iters > 0)
        {
-            lv_32fc_t dotProduct = lv_cmake(0, 0);
+            lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
            float32_t arg_phase0 = cargf(_phase);
            float32_t arg_phase_inc = cargf(phase_inc);
            float32_t phase_est;
@ -760,7 +762,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
            for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
                {
                    vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]);  // Store the results back into the dot product vector
-                    dotProduct = lv_cmake(0, 0);
+                    dotProduct = lv_cmake(0.0f, 0.0f);
                    for (i = 0; i < 4; ++i)
                        {
                            dotProduct = dotProduct + dotProductVector[i];
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt
@ -145,6 +145,11 @@ macro(OVERRULE_ARCH arch reason)
    list(REMOVE_ITEM available_archs ${arch})
 endmacro()
 # FORCE_ARCH(<arch> <reason>)
 # Prints a STATUS message containing <reason> and the forced <arch>, then
 # appends <arch> to the `available_archs` list.
 # Note: the append is unconditional, so an <arch> that is already present in
 # `available_archs` will be listed twice.
 macro(FORCE_ARCH arch reason)
    message(STATUS "${reason}, Forced arch ${arch}")
    list(APPEND available_archs ${arch})
 endmacro()
 ########################################################################
 # eliminate AVX on if not on x86, or if the compiler does not accept
 # the xgetbv instruction, or {if not cross-compiling and the xgetbv
@ -264,13 +269,13 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
    endif()
    # MSVC 64 bit does not have MMX, overrule it
-    if(${SIZEOF_CPU} EQUAL 64 AND MSVC)
+    if(MSVC)
-        overrule_arch(mmx "No MMX for Win64")
+        if(${SIZEOF_CPU} EQUAL 64)
-        if(MSVC_VERSION GREATER 1700)
+            overrule_arch(mmx "No MMX for Win64")
            overrule_arch(sse "No SSE for Win64 Visual Studio 2013")
        endif()
        force_arch(sse "Built-in for MSVC > 2013")
        force_arch(sse2 "Built-in for MSVC > 2013")
    endif()
 endif()
 ########################################################################
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c
@ -15,25 +15,10 @@
 #include <string.h>
 // clang-format on
 #if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
 #define __popcnt __builtin_popcount
 #else
 // Fallback population count for compilers without a popcount builtin.
 // Returns the number of bits set to 1 in `num` (0 for num == 0).
 //
 // Declared `static inline`: a plain `inline` definition in C99/C11 does not
 // provide an external definition, so any call the compiler decides not to
 // inline (e.g. in unoptimized builds) would fail at link time.
 static inline unsigned __popcnt(unsigned num)
 {
    unsigned pop = 0;
    while (num)
        {
            num &= num - 1;  // clears the lowest set bit: one iteration per set bit
            pop++;
        }
    return pop;
 }
 #endif
 int volk_gnsssdr_get_index(
-    const char *impl_names[],  //list of implementations by name
+    const char *impl_names[],  // list of implementations by name
-    const size_t n_impls,      //number of implementations available
+    const size_t n_impls,      // number of implementations available
-    const char *impl_name      //the implementation name to find
+    const char *impl_name      // the implementation name to find
 )
 {
    unsigned int i;
@ -44,20 +29,20 @@ int volk_gnsssdr_get_index(
                    return i;
                }
        }
-    //TODO return -1;
+    // TODO return -1;
-    //something terrible should happen here
+    // something terrible should happen here
    fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
    return volk_gnsssdr_get_index(impl_names, n_impls, "generic");  //but we'll fake it for now
 }
 int volk_gnsssdr_rank_archs(
-    const char *kern_name,     //name of the kernel to rank
+    const char *kern_name,     // name of the kernel to rank
-    const char *impl_names[],  //list of implementations by name
+    const char *impl_names[],  // list of implementations by name
-    const int *impl_deps,      //requirement mask per implementation
+    const int *impl_deps,      // requirement mask per implementation
-    const bool *alignment,     //alignment status of each implementation
+    const bool *alignment,     // alignment status of each implementation
-    size_t n_impls,            //number of implementations available
+    size_t n_impls,            // number of implementations available
-    const bool align           //if false, filter aligned implementations
+    const bool align           // if false, filter aligned implementations
 )
 {
    size_t i;
@ -78,7 +63,7 @@ int volk_gnsssdr_rank_archs(
            return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
        }
-    //now look for the function name in the prefs list
+    // now look for the function name in the prefs list
    for (i = 0; i < n_arch_prefs; i++)
        {
            if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name)))  //found it
@ -88,14 +73,14 @@ int volk_gnsssdr_rank_archs(
                }
        }
-    //return the best index with the largest deps
+    // return the best index with the largest deps
    size_t best_index_a = 0;
    size_t best_index_u = 0;
    int best_value_a = -1;
    int best_value_u = -1;
    for (i = 0; i < n_impls; i++)
        {
-            const signed val = __popcnt(impl_deps[i]);
+            const signed val = impl_deps[i];
            if (alignment[i] && val > best_value_a)
                {
                    best_index_a = i;
@ -108,9 +93,9 @@ int volk_gnsssdr_rank_archs(
                }
        }
-    //when align and we found a best aligned, use it
+    // when align and we found a best aligned, use it
    if (align && best_value_a != -1) return best_index_a;
-    //otherwise return the best unaligned
+    // otherwise return the best unaligned
    return best_index_u;
 }