// mirror of https://github.com/osmarks/maghammer.git synced 2025-08-09 23:33:44 +00:00
// maghammer/src/indexers/minoteaur.rs (348 lines, 13 KiB, Rust)
use std::collections::HashMap;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use serde::{Deserialize, Serialize};
use crate::util::hash_str;
use crate::indexer::{Ctx, Indexer, TableSpec, ColumnSpec};
use chrono::prelude::*;
use rusqlite::OpenFlags;
/// TOML-deserialized configuration for the Minoteaur indexer.
#[derive(Serialize, Deserialize)]
struct Config {
/// Path to the Minoteaur SQLite event-log database (opened read-only).
db_path: String,
/// Base URL of the running Minoteaur instance; used by `url_for` to build page links.
base_url: String
}
/// Indexer that syncs pages, revisions and page views from a Minoteaur
/// SQLite event store into the Postgres search tables (`mino_pages` etc.).
#[derive(Clone)]
pub struct MinoteaurIndexer {
/// Arc so a clone of the indexer can move into the blocking reader task cheaply.
config: Arc<Config>
}
// https://github.com/osmarks/minoteaur-8/blob/master/src/storage.rs
// https://github.com/osmarks/minoteaur-8/blob/master/src/util.rs
mod minoteaur_types {
// Types vendored from Minoteaur's own storage layer so its msgpack-encoded
// event log can be decoded here. The serde attributes (ulid_as_u128,
// ts_milliseconds, defaults) must stay wire-compatible with upstream.
use serde::{Deserialize, Serialize};
use ulid::Ulid;
use chrono::Utc;
use std::collections::{BTreeSet, HashMap};
// A structured-data value attached to a page: either free text or a number.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum Value {
Text(String),
Number(f64)
}
// Size statistics for a page's content.
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
pub struct ContentSize {
pub words: usize,
pub bytes: usize,
pub lines: usize
}
// Ordered key/value pairs of user-defined page metadata (order is preserved).
pub type StructuredData = Vec<(String, Value)>;
// A file attached to a page.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct File {
// ULID of the page owning this file.
#[serde(with="ulid::serde::ulid_as_u128")]
pub page: Ulid,
pub filename: String,
#[serde(with="chrono::serde::ts_milliseconds")]
pub created: chrono::DateTime<Utc>,
// Location on disk inside the Minoteaur data directory — not used by this indexer.
pub storage_path: String,
pub size: u64,
pub mime_type: String,
pub metadata: HashMap<String, String>
}
// Full current state of a wiki page.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Page {
#[serde(with="ulid::serde::ulid_as_u128")]
pub id: Ulid,
#[serde(with="chrono::serde::ts_milliseconds")]
pub updated: chrono::DateTime<Utc>,
#[serde(with="chrono::serde::ts_milliseconds")]
pub created: chrono::DateTime<Utc>,
pub title: String,
// Alternative names/aliases for the page.
pub names: BTreeSet<String>,
pub content: String,
pub tags: BTreeSet<String>,
pub size: ContentSize,
// NOTE(review): presumably keyed by filename — confirm against upstream storage.rs.
#[serde(default)]
pub files: HashMap<String, File>,
#[serde(default)]
pub icon_filename: Option<String>,
#[serde(default)]
pub structured_data: StructuredData,
#[serde(default)]
pub theme: Option<String>
}
// One kind of change recorded in a page's edit history.
// Only PageCreated is treated specially by the indexer (it is not counted as a revision).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum RevisionType {
AddName(String),
AddTag(String),
ContentUpdate { new_content_size: ContentSize, edit_distance: Option<u32> },
PageCreated,
RemoveName(String),
RemoveTag(String),
AddFile(String),
RemoveFile(String),
SetIconFilename(Option<String>),
SetStructuredData(StructuredData),
SetTheme(Option<String>),
Rename(String)
}
// A single recorded view of a page.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PageView {
#[serde(with="ulid::serde::ulid_as_u128")]
pub page: Ulid,
#[serde(with="chrono::serde::ts_milliseconds")]
pub time: chrono::DateTime<Utc>
}
// Metadata for one revision (the revision payload itself is not needed here).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RevisionHeader {
#[serde(with="ulid::serde::ulid_as_u128")]
pub id: Ulid,
#[serde(with="ulid::serde::ulid_as_u128")]
pub page: Ulid,
pub ty: RevisionType,
#[serde(with="chrono::serde::ts_milliseconds")]
pub time: chrono::DateTime<Utc>
}
// Top-level record type stored in the Minoteaur `objects` table.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum Object {
Page(Page),
Revision(RevisionHeader),
PageView(PageView)
}
}
#[async_trait::async_trait]
impl Indexer for MinoteaurIndexer {
    fn name(&self) -> &'static str {
        "minoteaur"
    }

    /// Postgres DDL for the target tables. `mino_pages.id` is `hash_str(ulid)`;
    /// child tables reference it with ON DELETE CASCADE so replacing a page
    /// row clears its structured data and files automatically.
    fn schemas(&self) -> &'static [&'static str] {
        &[r#"
CREATE TABLE IF NOT EXISTS mino_pages (
id BIGINT PRIMARY KEY,
ulid TEXT NOT NULL,
timestamp TIMESTAMPTZ NOT NULL,
created TIMESTAMPTZ NOT NULL,
title TEXT NOT NULL,
words INTEGER NOT NULL,
tags TEXT[] NOT NULL,
names TEXT[] NOT NULL,
content TEXT NOT NULL,
view_count BIGINT NOT NULL,
revision_count BIGINT NOT NULL,
last_view_timestamp TIMESTAMPTZ NOT NULL
);
CREATE TABLE IF NOT EXISTS mino_structured_data (
id BIGSERIAL PRIMARY KEY,
page BIGINT NOT NULL REFERENCES mino_pages(id) ON DELETE CASCADE,
key TEXT NOT NULL,
numeric_value DOUBLE PRECISION,
text_value TEXT
);
CREATE TABLE IF NOT EXISTS mino_files (
id BIGSERIAL PRIMARY KEY,
page BIGINT NOT NULL REFERENCES mino_pages(id) ON DELETE CASCADE,
filename TEXT NOT NULL,
size INTEGER NOT NULL,
timestamp TIMESTAMPTZ NOT NULL,
metadata JSONB NOT NULL
);
"#]
    }

    /// Search-framework metadata: which columns are full-text / trigram
    /// indexed and which appear in result summaries.
    fn tables(&self) -> &'static [TableSpec] {
        &[
            TableSpec {
                name: "mino_pages",
                parent: None,
                columns: &[
                    ColumnSpec {
                        name: "title",
                        fts: true,
                        fts_short: true,
                        trigram: true,
                        is_array: false
                    },
                    ColumnSpec {
                        name: "content",
                        fts: true,
                        fts_short: false,
                        trigram: false,
                        is_array: false
                    },
                    ColumnSpec {
                        name: "tags",
                        fts: true,
                        fts_short: true,
                        trigram: true,
                        is_array: true
                    },
                    ColumnSpec {
                        name: "names",
                        fts: true,
                        fts_short: true,
                        trigram: true,
                        is_array: true
                    }
                ],
                url_source_column: Some("ulid"),
                title_source_column: "title",
                summary_columns: &["title", "tags", "timestamp", "created", "words", "view_count", "revision_count"]
            },
            TableSpec {
                name: "mino_structured_data",
                parent: Some(("page", "mino_pages")),
                columns: &[
                    ColumnSpec {
                        name: "key",
                        fts: false,
                        fts_short: false,
                        trigram: true,
                        is_array: false
                    },
                    ColumnSpec {
                        name: "text_value",
                        fts: true,
                        fts_short: true,
                        trigram: true,
                        is_array: false
                    }
                ],
                title_source_column: "key",
                url_source_column: None,
                summary_columns: &["key", "text_value", "numeric_value"]
            },
            TableSpec {
                name: "mino_files",
                parent: Some(("page", "mino_pages")),
                columns: &[
                    ColumnSpec {
                        name: "filename",
                        fts: true,
                        fts_short: true,
                        trigram: true,
                        is_array: false
                    }
                ],
                url_source_column: None,
                title_source_column: "filename",
                summary_columns: &["filename", "size", "timestamp", "metadata"]
            }
        ]
    }

    /// Incrementally sync the Minoteaur event log into Postgres.
    ///
    /// A blocking task streams decoded objects (in ID order, roughly
    /// chronological) over a channel while this task applies them, skipping
    /// anything not newer than what `mino_pages` already records.
    async fn run(&self, ctx: Arc<Ctx>) -> Result<()> {
        let (tx, mut rx) = tokio::sync::mpsc::channel(200);
        // BUGFIX: the original `let self = self.clone();` does not compile —
        // `self` is a keyword and cannot be rebound with `let`. Clone into a
        // fresh binding that the blocking closure can own instead.
        let this = self.clone();
        let bg = tokio::task::spawn_blocking(move || this.read_database(tx));
        // Snapshot of already-persisted (updated, last_view) timestamps keyed
        // by page ULID, used to discard stale objects from the stream. Note
        // this snapshot is not refreshed as rows are written during this run.
        let mut timestamps = HashMap::new();
        let mut conn = ctx.pool.get().await?;
        for row in conn.query("SELECT ulid, timestamp, last_view_timestamp FROM mino_pages", &[]).await? {
            let ulid: String = row.get(0);
            let updated: DateTime<Utc> = row.get(1);
            let last_view_timestamp: DateTime<Utc> = row.get(2);
            timestamps.insert(ulid::Ulid::from_string(&ulid)?, (updated, last_view_timestamp));
        }
        while let Some((id, object)) = rx.recv().await {
            match object {
                minoteaur_types::Object::Page(page) => {
                    // If we already have the latest information on this page, skip it.
                    if let Some((updated_timestamp, _last_view_timestamp)) = timestamps.get(&id) {
                        if *updated_timestamp >= page.updated {
                            continue;
                        }
                    }
                    let ulid = id.to_string();
                    let int_id = hash_str(&ulid);
                    // Replace the page row and its children atomically; the
                    // DELETE cascades to mino_structured_data and mino_files.
                    let tx = conn.transaction().await?;
                    tx.execute("DELETE FROM mino_pages WHERE id = $1", &[&int_id]).await?;
                    // view_count/revision_count restart at 0 and are re-incremented
                    // by the PageView/Revision objects that follow in the stream.
                    tx.execute("INSERT INTO mino_pages VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, 0, 0, $10)",
                        &[&int_id, &ulid, &page.updated, &page.created, &page.title, &(page.size.words as i32), &page.tags.into_iter().collect::<Vec<String>>(), &page.names.into_iter().collect::<Vec<String>>(), &page.content, &page.created])
                        .await?;
                    for (key, value) in page.structured_data {
                        let (num, text) = match value {
                            minoteaur_types::Value::Number(x) => (Some(x), None),
                            minoteaur_types::Value::Text(t) => (None, Some(t))
                        };
                        tx.execute("INSERT INTO mino_structured_data (page, key, numeric_value, text_value) VALUES ($1, $2, $3, $4)", &[&int_id, &key, &num, &text]).await?;
                    }
                    for (_key, file) in page.files {
                        tx.execute("INSERT INTO mino_files (page, filename, size, timestamp, metadata) VALUES ($1, $2, $3, $4, $5)", &[&int_id, &file.filename, &(file.size as i32), &file.created, &tokio_postgres::types::Json(file.metadata)]).await?;
                    }
                    tx.commit().await?;
                },
                // These should only occur after the page's record, with the exception of page creation.
                minoteaur_types::Object::PageView(view) => {
                    if let Some((_updated_timestamp, last_view_timestamp)) = timestamps.get(&view.page) {
                        if *last_view_timestamp >= view.time {
                            continue;
                        }
                    }
                    let int_id = hash_str(&view.page.to_string());
                    conn.execute("UPDATE mino_pages SET view_count = view_count + 1, last_view_timestamp = $2 WHERE id = $1", &[&int_id, &view.time]).await?;
                },
                minoteaur_types::Object::Revision(rev) => {
                    // There's no separate "last revision timestamp" because revisions should always be associated with the updated field being adjusted.
                    if let Some((updated_timestamp, _last_view_timestamp)) = timestamps.get(&rev.page) {
                        if *updated_timestamp >= rev.time {
                            continue;
                        }
                    }
                    // Page creation is represented by the page row itself; don't count it as a revision.
                    if let minoteaur_types::RevisionType::PageCreated = rev.ty {
                        continue;
                    }
                    let int_id = hash_str(&rev.page.to_string());
                    conn.execute("UPDATE mino_pages SET revision_count = revision_count + 1 WHERE id = $1", &[&int_id]).await?;
                }
            }
        }
        // Minoteaur doesn't have a delete button so not supporting deletes is clearly fine, probably.
        bg.await??;
        Ok(())
    }

    /// Build a deep link to a page; `column_content` is the page ULID
    /// (url_source_column = "ulid" in `tables()`).
    fn url_for(&self, _table: &str, column_content: &str) -> String {
        format!("{}#/page/{}", self.config.base_url, column_content)
    }
}
impl MinoteaurIndexer {
    /// Construct an indexer from its TOML configuration table.
    pub async fn new(config: toml::Table) -> Result<Box<Self>> {
        let parsed: Config = config.try_into()?;
        Ok(Box::new(Self {
            config: Arc::new(parsed)
        }))
    }

    /// Stream every object in the Minoteaur event store, oldest first, into
    /// `target`. Runs on a blocking thread since rusqlite is synchronous;
    /// the database is opened strictly read-only.
    fn read_database(&self, target: tokio::sync::mpsc::Sender<(ulid::Ulid, minoteaur_types::Object)>) -> Result<()> {
        let conn = rusqlite::Connection::open_with_flags(&self.config.db_path, OpenFlags::SQLITE_OPEN_READ_ONLY)?;
        // Minoteaur databases are structured so that the system state can be understood by reading objects in ID order, as ID increases with timestamp.
        // Technically the clocks might not be perfectly monotonic but this is unlikely enough to ever be significant that I don't care.
        let mut stmt = conn.prepare("SELECT id, data FROM objects ORDER BY id ASC")?;
        let rows = stmt.query_map([], |row| {
            Ok((row.get::<_, Vec<u8>>(0)?, row.get::<_, Vec<u8>>(1)?))
        })?;
        for row in rows {
            let (raw_id, raw_data) = row?;
            // The id column holds the 16 raw bytes of a ULID.
            let id_bytes: [u8; 16] = raw_id.try_into().map_err(|_| anyhow!("conversion failure"))?;
            let object = rmp_serde::decode::from_slice(&raw_data).context("parse object")?;
            target.blocking_send((ulid::Ulid::from_bytes(id_bytes), object))?;
        }
        Ok(())
    }
}