From 1ab254ff1d487e4220a8661a4936f07db5f59cc4 Mon Sep 17 00:00:00 2001 From: osmarks Date: Tue, 25 Jun 2024 08:23:30 +0100 Subject: [PATCH] Adjust index storage for memory efficiency and fix SQLite interface type confusion --- Cargo.lock | 36 ++++++++++++++++++++++++++++++++++++ Cargo.toml | 5 +++-- src/main.rs | 33 ++++++++++++++++++++------------- src/video_reader.rs | 2 +- 4 files changed, 60 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 09b91dd..8ea0680 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -365,6 +365,15 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.0.98" @@ -432,6 +441,21 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" +[[package]] +name = "compact_str" +version = "0.8.0-beta" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a2dc81369dde6d31456eedbb4fd3d320f0b9713573dfe06e569e2bce7607f2" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -1483,6 +1507,7 @@ dependencies = [ "axum", "base64 0.22.1", "chrono", + "compact_str", "faiss", "fastrand", "ffmpeg-the-third", @@ -1581,6 +1606,15 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "nasm-rs" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe4d98d0065f4b1daf164b3eafb11974c94662e5e2396cf03f32d0bb5c17da51" +dependencies = [ + "rayon", +] + [[package]] name = "native-tls" version = "0.2.11" @@ -2093,6 +2127,7 @@ dependencies = [ "av1-grain", "bitstream-io", "built", + "cc", "cfg-if", "interpolate_name", "itertools", @@ -2100,6 +2135,7 @@ dependencies = [ "libfuzzer-sys", "log", "maybe-rayon", + "nasm-rs", "new_debug_unreachable", "noop_proc_macro", "num-derive", diff --git a/Cargo.toml b/Cargo.toml index ca5323d..464cf8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" [dependencies] tokio = { version = "1", features = ["full"] } axum = "0.7" -image = { version = "0.25", features = ["avif", "avif-native"] } +image = { version = "0.25", features = ["avif", "avif-native", "nasm"] } reqwest = { version = "0.12", features = ["multipart"] } serde = { version = "1", features = ["derive"] } sqlx = { version = "0.7", features = ["runtime-tokio", "sqlite"] } @@ -40,6 +40,7 @@ fastrand = "2" mimalloc = "0.1" sonic-rs = "0.3" ffmpeg-the-third = "2.0" +compact_str = { version = "0.8.0-beta", features = ["serde"] } [patch.crates-io] image = { git = "https://github.com/fintelia/image/", branch = "upgrade-zune-jpeg" } @@ -50,4 +51,4 @@ path = "src/reddit_dump.rs" [[bin]] name = "video-reader" -path = "src/video_reader.rs" \ No newline at end of file +path = "src/video_reader.rs" diff --git a/src/main.rs b/src/main.rs index 0799c70..3141962 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,6 +14,7 @@ use axum::{ http::StatusCode }; use common::resize_for_embed_sync; +use compact_str::CompactString; use image::RgbImage; use image::{imageops::FilterType, io::Reader as ImageReader, DynamicImage, ImageFormat}; use reqwest::Client; @@ -122,7 +123,7 @@ struct RawFileRecord { #[derive(Debug, Clone)] struct FileRecord { - filename: String, + filename: CompactString, needs_embed: bool, needs_ocr: bool, needs_thumbnail: bool @@ -145,8 +146,8 @@ struct LoadedImage { #[derive(Debug, Clone, Serialize, Deserialize, Hash)] enum Filename { - Actual(String), - VideoFrame(String, u64) + Actual(CompactString), + VideoFrame(CompactString, u64) } // this is a somewhat horrible hack, but probably nobody has NUL bytes at the start of filenames? @@ -154,7 +155,7 @@ impl Filename { fn decode(buf: Vec) -> Result { Ok(match buf.strip_prefix(&[0]) { Some(remainder) => rmp_serde::from_read(&*remainder)?, - None => Filename::Actual(String::from_utf8(buf)?.to_string()) + None => Filename::Actual(CompactString::from_utf8(buf)?) }) } @@ -325,7 +326,7 @@ async fn ingest_files(config: Arc) -> Result<()> { let to_ocr_tx = to_ocr_tx.clone(); let video_lengths = video_lengths.clone(); async move { - let path = Path::new(&config.service.files).join(&record.filename); + let path = Path::new(&config.service.files).join(&*record.filename); let image: Result> = tokio::task::block_in_place(|| Ok(Arc::new(ImageReader::open(&path)?.with_guessed_format()?.decode()?))); let image = match image { Ok(image) => image, @@ -490,7 +491,7 @@ async fn ingest_files(config: Arc) -> Result<()> { let mut conn = pool.acquire().await?; ensure_filename_record_exists(&mut conn, &filename_enc).await?; match filename { - Filename::VideoFrame(container, _) => { video_thumb_times.write().await.insert(container.to_string(), timestamp()); }, + Filename::VideoFrame(container, _) => { video_thumb_times.write().await.insert(container.clone(), timestamp()); }, _ => () } sqlx::query!( @@ -588,7 +589,7 @@ async fn ingest_files(config: Arc) -> Result<()> { IMAGES_EMBEDDED_COUNTER.inc(); ensure_filename_record_exists(&mut *tx, &encoded_filename).await?; match &batch[i].filename { - Filename::VideoFrame(container, _) => { video_embed_times.write().await.insert(container.to_string(), timestamp()); }, + Filename::VideoFrame(container, _) => { video_embed_times.write().await.insert(container.clone(), timestamp()); }, _ => () } sqlx::query!( @@ -614,7 +615,7 @@ async fn ingest_files(config: Arc) -> Result<()> { let entry = entry?; let path = entry.path(); if path.is_file() { - let filename = path.strip_prefix(&config.service.files)?.to_str().unwrap().to_string(); + let filename = CompactString::from(path.strip_prefix(&config.service.files)?.to_str().unwrap()); let modtime = entry.metadata()?.modified()?.duration_since(std::time::UNIX_EPOCH)?; let modtime = modtime.as_micros() as i64; actual_filenames.insert(filename.clone(), (path.to_path_buf(), modtime)); @@ -627,7 +628,8 @@ async fn ingest_files(config: Arc) -> Result<()> { for (filename, (_path, modtime)) in actual_filenames.iter() { let modtime = *modtime; - let record = sqlx::query_as!(RawFileRecord, "SELECT * FROM files WHERE filename = ?", filename) + let filename_arr = filename.as_bytes(); + let record = sqlx::query_as!(RawFileRecord, "SELECT * FROM files WHERE filename = ?", filename_arr) .fetch_optional(&pool) .await?; @@ -681,10 +683,14 @@ async fn ingest_files(config: Arc) -> Result<()> { for filename in stored { let parsed_filename = Filename::decode(filename.clone())?; match parsed_filename { - Filename::Actual(s) => if !actual_filenames.contains_key(&s) { - sqlx::query!("DELETE FROM files WHERE filename = ?", s) - .execute(&mut *tx) - .await?; + Filename::Actual(s) => { + let s = &*s; + let raw = &filename; + if !actual_filenames.contains_key(s) { + sqlx::query!("DELETE FROM files WHERE filename = ?", raw) + .execute(&mut *tx) + .await?; + } }, // This might fail in some cases where for whatever reason a video is replaced with a file of the same name which is not a video. Don't do that. Filename::VideoFrame(container, frame) => if !actual_filenames.contains_key(&container) { @@ -704,6 +710,7 @@ async fn ingest_files(config: Arc) -> Result<()> { for container_filename in video_lengths.keys() { let embed_time = video_embed_times.get(container_filename); let thumb_time = video_thumb_times.get(container_filename); + let container_filename: &[u8] = container_filename.as_bytes(); sqlx::query!("INSERT OR REPLACE INTO files (filename, embedding_time, thumbnail_time) VALUES (?, ?, ?)", container_filename, embed_time, thumb_time) .execute(&mut *tx) .await?; diff --git a/src/video_reader.rs b/src/video_reader.rs index 27bddb6..3d8d8f2 100644 --- a/src/video_reader.rs +++ b/src/video_reader.rs @@ -2,7 +2,7 @@ extern crate ffmpeg_the_third as ffmpeg; use anyhow::{Result, Context}; use image::RgbImage; use std::env; -use ffmpeg::{codec, filter, format::{self, Pixel}, media::Type, util::frame::video::Video, software::scaling}; +use ffmpeg::{codec, filter, format::{self, Pixel}, media::Type, util::frame::video::Video}; const BYTES_PER_PIXEL: usize = 3;