use std::collections::HashSet; use std::ffi::OsStr; use std::sync::Arc; use anyhow::Result; use compact_str::CompactString; use futures::TryStreamExt; use serde::{Deserialize, Serialize}; use tokio::sync::RwLock; use crate::util::{hash_str, parse_html, parse_pdf, systemtime_to_utc, urlencode, CONFIG}; use crate::indexer::{Ctx, Indexer, TableSpec, delete_nonexistent_files, ColumnSpec}; use async_walkdir::WalkDir; use chrono::prelude::*; use regex::RegexSet; #[derive(Serialize, Deserialize)] struct Config { path: String, #[serde(default)] ignore_regexes: Vec, base_url: String } pub struct TextFilesIndexer { config: Config, ignore: RegexSet } lazy_static::lazy_static! { static ref VALID_EXTENSIONS: HashSet<&'static str> = ["pdf", "txt", "html", "htm", "xhtml"].into_iter().collect(); } #[async_trait::async_trait] impl Indexer for TextFilesIndexer { fn name(&self) -> &'static str { "text_files" } fn schemas(&self) -> &'static [&'static str] { &[r#" CREATE TABLE text_files ( id BIGINT PRIMARY KEY, path TEXT NOT NULL UNIQUE, title TEXT, content TEXT NOT NULL, timestamp TIMESTAMPTZ ); "#] } fn tables(&self) -> &'static [TableSpec] { &[ TableSpec { name: "text_files", parent: None, columns: &[ ColumnSpec { name: "path", fts: true, fts_short: true, trigram: true, is_array: false }, ColumnSpec { name: "title", fts: true, fts_short: true, trigram: true, is_array: false }, ColumnSpec { name: "content", fts: true, fts_short: false, trigram: false, is_array: false } ], url_source_column: Some("path"), title_source_column: "title", summary_columns: &["path", "title", "timestamp"] } ] } async fn run(&self, ctx: Arc) -> Result<()> { let entries = WalkDir::new(&self.config.path); // TODO let ignore = &self.ignore; let base_path = &self.config.path; let existing_files = Arc::new(RwLock::new(HashSet::new())); entries.map_err(|e| anyhow::Error::from(e)).try_for_each_concurrent(Some(CONFIG.concurrency), |entry| { let ctx = ctx.clone(); let existing_files = existing_files.clone(); async move { let real_path = entry.path(); let path = if let Some(path) = real_path.strip_prefix(base_path)?.to_str() { path } else { return Result::Ok(()); }; let ext = real_path.extension().and_then(OsStr::to_str); if ignore.is_match(path) || !entry.file_type().await?.is_file() || !VALID_EXTENSIONS.contains(ext.unwrap_or_default()) { return Ok(()); } let mut conn = ctx.pool.get().await?; existing_files.write().await.insert(CompactString::from(path)); let metadata = entry.metadata().await?; let row = conn.query_opt("SELECT timestamp FROM text_files WHERE id = $1", &[&hash_str(path)]).await?; let timestamp: DateTime = row.map(|r| r.get(0)).unwrap_or(DateTime::::MIN_UTC); let modtime = systemtime_to_utc(metadata.modified()?)?; if modtime > timestamp { let parse = match ext { Some("pdf") => { parse_pdf(&real_path).await.map(Some) }, Some("txt") => { let content = tokio::fs::read(&real_path).await?; Ok(Some((String::from_utf8_lossy(&content).to_string(), String::new()))) }, Some("htm") | Some("html") | Some("xhtml") => { let content = tokio::fs::read(&real_path).await?; Ok(Some(tokio::task::block_in_place(|| parse_html(&content, true)))) }, _ => Ok(None), }; match parse { Ok(None) => (), Ok(Some((content, title))) => { // Null bytes aren't legal in Postgres strings despite being valid UTF-8. let tx = conn.transaction().await?; tx.execute("DELETE FROM text_files WHERE id = $1", &[&hash_str(path)]).await?; tx.execute("INSERT INTO text_files VALUES ($1, $2, $3, $4, $5)", &[&hash_str(path), &path, &title.replace("\0", ""), &content.replace("\0", ""), &modtime]) .await?; tx.commit().await?; }, Err(e) => log::warn!("File parse for {}: {}", path, e) } } Result::Ok(()) } }).await?; { let existing = existing_files.read().await; delete_nonexistent_files(ctx, "SELECT path FROM text_files", "DELETE FROM text_files WHERE id = $1", &existing).await?; } Ok(()) } fn url_for(&self, _table: &str, column_content: &str) -> String { format!("{}{}", self.config.base_url, urlencode(column_content)) } } impl TextFilesIndexer { pub async fn new(config: toml::Table) -> Result> { let config: Config = config.try_into()?; Ok(Box::new(TextFilesIndexer { ignore: RegexSet::new(&config.ignore_regexes)?, config })) } }