Multithread query server

While profiling suggests that most operations are cheap and I/O-bound rather than CPU-bound, the GEMM used for deduplication is quite slow. As such, run the query server on multiple threads for higher throughput.
2026-06-02 10:52:18 +00:00 · 2025-01-31 13:47:47 +00:00
parent 5215822e39
commit e57931d47f
5 changed files with 130 additions and 86 deletions
@@ -70,7 +70,10 @@ fn main() -> Result<()> {
            l: 192,
            maxc: 750,
            alpha: 65200,
-            saturate_graph: false
+            saturate_graph: false,
+            query_breakpoint: vecs.len() as u32,
+            query_alpha: 65200,
+            max_add_per_stitch_iter: 0
        };

        let mut graph = IndexGraph::empty(vecs.len(), config.r);
@@ -105,13 +108,16 @@ fn main() -> Result<()> {
        l: 200,
        alpha: 65536,
        maxc: 0,
-        saturate_graph: false
+        saturate_graph: false,
+        query_breakpoint: vecs.len() as u32,
+        query_alpha: 65200,
+        max_add_per_stitch_iter: 0
    };

    let mut scratch = Scratch::new(config);

    for (i, vec) in tqdm::tqdm(vecs.iter().enumerate()) {
-        let ctr = greedy_search(&mut scratch, medioid, &vec, &vecs, &graph, config);
+        let ctr = greedy_search(&mut scratch, medioid, false, &vec, &vecs, &graph, config);
        cmps_ctr += ctr.distances;
        cmps.push(ctr.distances);
        if scratch.neighbour_buffer.ids[0] == (i as u32) {
@@ -1,5 +1,3 @@
-use core::f32;
-
 use half::f16;
 use simsimd::SpatialSimilarity;
 use fastrand::Rng;
@@ -420,6 +418,7 @@ pub fn scale_dot_result_f64(x: f64) -> i64 {
 #[cfg(test)]
 mod bench {
    use super::*;
+    use half::slice::HalfFloatSliceExt;
    use test::Bencher;

    #[bench]
@@ -451,4 +450,27 @@ mod bench {
            fast_dot_noprefetch(&a, &b)
        });
    }
+
+    #[bench]
+    fn bench_preprocess_query(be: &mut Bencher) {
+        let mut rng = fastrand::Rng::with_seed(1);
+        let pq = rmp_serde::from_slice::<ProductQuantizer>(&std::fs::read("opq.msgpack").unwrap()).unwrap();
+        let query = Vector::randn(&mut rng, pq.n_dims).to_f32_vec();
+        be.iter(|| {
+            pq.preprocess_query(&query)
+        });
+    }
+
+    #[bench]
+    fn bench_asymmetric_dot_product(be: &mut Bencher) {
+        let mut rng = fastrand::Rng::with_seed(1);
+        let pq = rmp_serde::from_slice::<ProductQuantizer>(&std::fs::read("opq.msgpack").unwrap()).unwrap();
+        let query = Vector::randn(&mut rng, pq.n_dims).to_f32_vec();
+        let lut = pq.preprocess_query(&query);
+        let mut pq_vectors = vec![0; 100 * pq.n_dims / pq.n_dims_per_code];
+        rng.fill(&mut pq_vectors);
+        be.iter(|| {
+            pq.asymmetric_dot_product(&lut, &pq_vectors)
+        });
+    }
 }