Make SIMD instructions work on MS Windows

2025-10-25 12:37:40 +00:00 · 2020-11-20 16:44:06 +01:00
parent 94337c159f
commit 7e1f0176f4
9 changed files with 87 additions and 62 deletions
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -57,6 +57,8 @@ SPDX-FileCopyrightText: 2011-2020 Carles Fernandez-Prades <carles.fernandez@cttc
  GNSS-SDR, the old method is still used in old development environments. No
  extra dependency is needed. This change is transparent to the user, since
  everything is managed by the CMake scripts.
+- The `volk_gnsssdr` library can be built on Microsoft Windows and can execute
+  SIMD instructions on that OS.
 - Fix building with `-DENABLE_CUDA=ON` for blocks implemented with CUDA.

 ### Improvements in Usability:
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/CMakeLists.txt
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/CMakeLists.txt
@@ -247,7 +247,10 @@ if(CMAKE_VERSION VERSION_GREATER 3.0 AND SUPPORTED_CPU_FEATURES_ARCH)
        FORCE
    )
    set(USE_CPU_FEATURES ON)
+    set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
+    set(BUILD_SHARED_LIBS OFF)
    add_subdirectory(cpu_features)
+    set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
 endif()

 # Python
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/README.md
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/README.md
@@ -139,6 +139,24 @@ $ volk_gnsssdr_profile
 $ sudo ldconfig
 ```

+### Building on Microsoft Windows
+
+With Visual Studio 2019, in a Powershell run by administrator:
+
+```
+$ git clone https://github.com/gnss-sdr/gnss-sdr
+$ cd gnss-sdr
+$ git checkout next
+$ cd build
+$ cmake -G "Visual Studio 16 2019" ..\src\algorithms\libs\volk_gnsssdr_module\volk_gnsssdr
+$ cd ..
+$ cmake --build build --config Release
+$ cd build
+$ ctest -C Release
+$ cd ..
+$ cmake --install build
+```
+
 ## References

 If you use VOLK_GNSSSDR in your research and/or software, please cite the
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/gen/archs.xml
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/gen/archs.xml
@@ -68,7 +68,6 @@
  <check name="mmx"></check>
  <flag compiler="gnu">-mmmx</flag>
  <flag compiler="clang">-mmmx</flag>
-  <flag compiler="msvc">/arch:SSE</flag>
  <alignment>8</alignment>
 </arch>

@@ -84,7 +83,6 @@
  <check name="sse"></check>
  <flag compiler="gnu">-msse</flag>
  <flag compiler="clang">-msse</flag>
-  <flag compiler="msvc">/arch:SSE</flag>
  <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
  <include>xmmintrin.h</include>
  <alignment>16</alignment>
@@ -94,7 +92,6 @@
  <check name="sse2"></check>
  <flag compiler="gnu">-msse2</flag>
  <flag compiler="clang">-msse2</flag>
-  <flag compiler="msvc">/arch:SSE2</flag>
  <alignment>16</alignment>
 </arch>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h
@@ -63,7 +63,6 @@
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <volk_gnsssdr/volk_gnsssdr_malloc.h>
 #include <math.h>
-// #include <stdio.h>

 #ifdef LV_HAVE_GENERIC

@@ -74,7 +73,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    for (n = 0; n < num_points; n++)
        {
@@ -115,7 +114,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
    unsigned int j;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }

    for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@@ -158,6 +157,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
 {
+#ifndef WIN32
    unsigned int number = 0;
    int vec_ind = 0;
    unsigned int i = 0;
@@ -287,7 +287,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_

            _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]);  // Store the results back into the dot product vector

-            result[vec_ind] = lv_cmake(0, 0);
+            result[vec_ind] = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    result[vec_ind] += dotProductVector[i];
@@ -312,6 +312,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
        }

    *phase = _phase;
+#else
+    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
+#endif
 }

 #endif /* LV_HAVE_AVX */
@@ -322,6 +325,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points)
 {
+#ifndef WIN32
    unsigned int number = 0;
    int vec_ind = 0;
    unsigned int i = 0;
@@ -451,7 +455,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_

            _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]);  // Store the results back into the dot product vector

-            result[vec_ind] = lv_cmake(0, 0);
+            result[vec_ind] = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    result[vec_ind] += dotProductVector[i];
@@ -476,6 +480,9 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
        }

    *phase = _phase;
+#else
+    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, in_common, phase_inc, phase, in_a, num_a_vectors, num_points);
+#endif
 }


--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h
@@ -106,8 +106,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
        }
+#ifndef WIN32
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
-
+#else
+    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
+#endif
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
@@ -136,8 +139,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
        }
+#ifndef WIN32
    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
-
+#else
+    volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
+#endif
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h
@@ -74,7 +74,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }
    for (n = 0; n < num_points; n++)
        {
@@ -115,7 +115,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
    unsigned int j;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }

    for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@@ -158,7 +158,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
 #include <pmmintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int sse_iters = num_points / 2;
    int n_vec;
@@ -240,7 +240,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 2; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@@ -270,7 +270,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
 #include <pmmintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int sse_iters = num_points / 2;
    int n_vec;
@@ -352,7 +352,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 2; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@@ -382,7 +382,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int avx_iters = num_points / 4;
    int n_vec;
@@ -401,13 +401,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            acc[n_vec] = _mm256_setzero_ps();
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }

    // phase rotation registers
    __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;

-    __attribute__((aligned(32))) lv_32fc_t four_phase_inc[4];
+    __VOLK_ATTR_ALIGNED(32)
+    lv_32fc_t four_phase_inc[4];
    const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
    const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
    const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@@ -417,7 +418,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
    four_phase_inc[3] = phase_inc4;
    const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);

-    __attribute__((aligned(32))) lv_32fc_t four_phase_acc[4];
+    __VOLK_ATTR_ALIGNED(32)
+    lv_32fc_t four_phase_acc[4];
    four_phase_acc[0] = _phase;
    four_phase_acc[1] = _phase * phase_inc;
    four_phase_acc[2] = _phase * phase_inc2;
@@ -472,7 +474,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm256_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@@ -510,7 +512,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
 #include <immintrin.h>
 static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
 {
-    lv_32fc_t dotProduct = lv_cmake(0, 0);
+    lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
    lv_32fc_t tmp32_1, tmp32_2;
    const unsigned int avx_iters = num_points / 4;
    int n_vec;
@@ -529,7 +531,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            acc[n_vec] = _mm256_setzero_ps();
-            result[n_vec] = lv_cmake(0, 0);
+            result[n_vec] = lv_cmake(0.0f, 0.0f);
        }

    // phase rotation registers
@@ -602,7 +604,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            _mm256_store_ps((float*)dotProductVector, acc[n_vec]);  // Store the results back into the dot product vector
-            dotProduct = lv_cmake(0, 0);
+            dotProduct = lv_cmake(0.0f, 0.0f);
            for (i = 0; i < 4; ++i)
                {
                    dotProduct = dotProduct + dotProductVector[i];
@@ -655,7 +657,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*

    if (neon_iters > 0)
        {
-            lv_32fc_t dotProduct = lv_cmake(0, 0);
+            lv_32fc_t dotProduct = lv_cmake(0.0f, 0.0f);
            float32_t arg_phase0 = cargf(_phase);
            float32_t arg_phase_inc = cargf(phase_inc);
            float32_t phase_est;
@@ -760,7 +762,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
            for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
                {
                    vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]);  // Store the results back into the dot product vector
-                    dotProduct = lv_cmake(0, 0);
+                    dotProduct = lv_cmake(0.0f, 0.0f);
                    for (i = 0; i < 4; ++i)
                        {
                            dotProduct = dotProduct + dotProductVector[i];
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/CMakeLists.txt
@@ -145,6 +145,11 @@ macro(OVERRULE_ARCH arch reason)
    list(REMOVE_ITEM available_archs ${arch})
 endmacro()

+macro(FORCE_ARCH arch reason)
+    message(STATUS "${reason}, Forced arch ${arch}")
+    list(APPEND available_archs ${arch})
+endmacro()
+
 ########################################################################
 # eliminate AVX on if not on x86, or if the compiler does not accept
 # the xgetbv instruction, or {if not cross-compiling and the xgetbv
@@ -264,13 +269,13 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
    endif()

    # MSVC 64 bit does not have MMX, overrule it
-    if(${SIZEOF_CPU} EQUAL 64 AND MSVC)
-        overrule_arch(mmx "No MMX for Win64")
-        if(MSVC_VERSION GREATER 1700)
-            overrule_arch(sse "No SSE for Win64 Visual Studio 2013")
+    if(MSVC)
+        if(${SIZEOF_CPU} EQUAL 64)
+            overrule_arch(mmx "No MMX for Win64")
        endif()
+        force_arch(sse "Built-in for MSVC > 2013")
+        force_arch(sse2 "Built-in for MSVC > 2013")
    endif()
-
 endif()

 ########################################################################
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c
@@ -15,25 +15,10 @@
 #include <string.h>
 // clang-format on

-#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4
-#define __popcnt __builtin_popcount
-#else
-inline unsigned __popcnt(unsigned num)
-{
-    unsigned pop = 0;
-    while (num)
-        {
-            if (num & 0x1) pop++;
-            num >>= 1;
-        }
-    return pop;
-}
-#endif
-
 int volk_gnsssdr_get_index(
-    const char *impl_names[],  //list of implementations by name
-    const size_t n_impls,      //number of implementations available
-    const char *impl_name      //the implementation name to find
+    const char *impl_names[],  // list of implementations by name
+    const size_t n_impls,      // number of implementations available
+    const char *impl_name      // the implementation name to find
 )
 {
    unsigned int i;
@@ -44,20 +29,20 @@ int volk_gnsssdr_get_index(
                    return i;
                }
        }
-    //TODO return -1;
-    //something terrible should happen here
+    // TODO return -1;
+    // something terrible should happen here
    fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
    return volk_gnsssdr_get_index(impl_names, n_impls, "generic");  //but we'll fake it for now
 }


 int volk_gnsssdr_rank_archs(
-    const char *kern_name,     //name of the kernel to rank
-    const char *impl_names[],  //list of implementations by name
-    const int *impl_deps,      //requirement mask per implementation
-    const bool *alignment,     //alignment status of each implementation
-    size_t n_impls,            //number of implementations available
-    const bool align           //if false, filter aligned implementations
+    const char *kern_name,     // name of the kernel to rank
+    const char *impl_names[],  // list of implementations by name
+    const int *impl_deps,      // requirement mask per implementation
+    const bool *alignment,     // alignment status of each implementation
+    size_t n_impls,            // number of implementations available
+    const bool align           // if false, filter aligned implementations
 )
 {
    size_t i;
@@ -78,7 +63,7 @@ int volk_gnsssdr_rank_archs(
            return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
        }

-    //now look for the function name in the prefs list
+    // now look for the function name in the prefs list
    for (i = 0; i < n_arch_prefs; i++)
        {
            if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name)))  //found it
@@ -88,14 +73,14 @@ int volk_gnsssdr_rank_archs(
                }
        }

-    //return the best index with the largest deps
+    // return the best index with the largest deps
    size_t best_index_a = 0;
    size_t best_index_u = 0;
    int best_value_a = -1;
    int best_value_u = -1;
    for (i = 0; i < n_impls; i++)
        {
-            const signed val = __popcnt(impl_deps[i]);
+            const signed val = impl_deps[i];
            if (alignment[i] && val > best_value_a)
                {
                    best_index_a = i;
@@ -108,9 +93,9 @@ int volk_gnsssdr_rank_archs(
                }
        }

-    //when align and we found a best aligned, use it
+    // when align and we found a best aligned, use it
    if (align && best_value_a != -1) return best_index_a;

-    //otherwise return the best unaligned
+    // otherwise return the best unaligned
    return best_index_u;
 }