use crate::page;
use crate::page::{Id, Tag, TagRef};
use anyhow::Result;
use async_trait::async_trait;
use log::info;
use std::collections::{HashMap, HashSet};

/// Indexing wrapper over `page::Store`
///
/// `Index` keeps track of the page data necessary
/// to quickly look pages up by a tag query.
#[derive(Default)]
pub struct Index<T> {
    page_ids_by_tag: HashMap<Tag, HashSet<Id>>,
    tags_by_page_id: HashMap<Id, HashSet<Tag>>,
    page_info_by_page_id: HashMap<Id, PageInfo>,
    store: T,
}

/// Basic page info
#[derive(Debug, Clone)]
pub struct PageInfo {
    pub id: Id,
    pub title: String,
    pub headers: page::Headers,
}

/// Results of a tag query lookup
#[derive(Default, Debug, Clone)]
pub struct FindResults {
    pub matching_pages: Vec<PageInfo>,
    pub matching_tags: Vec<Tag>,
}

impl FindResults {
    fn empty() -> Self {
        Self::default()
    }
}

/// More compact (post-processed) `FindResults`
pub struct CompactResults {
    // tags present on the matching pages that the query did not
    // already filter on, with the number of pages carrying each
    pub tags: Vec<(Tag, usize)>,
    // pages whose tags all matched the query, so none of the `tags`
    // above leads to them
    pub direct_hit_pages: Vec<PageInfo>,
}

impl<T> Index<T>
where
    T: page::StoreMut,
{
    pub async fn new(store: T) -> Result<Self> {
        let mut s = Index {
            page_ids_by_tag: Default::default(),
            tags_by_page_id: Default::default(),
            page_info_by_page_id: Default::default(),
            store,
        };

        s.index_inner().await?;

        Ok(s)
    }

    /// Index the inner `Store`
    async fn index_inner(&mut self) -> Result<()> {
        let mut count = 0;
        let ids = self.store.iter().await?.collect::<Vec<_>>();
        for id in ids {
            count += 1;
            let page = self.store.get(id).await?;
            self.add_data_for_page(&page);
        }

        info!("Indexed {} pages", count);

        Ok(())
    }

    /// Compact the results to a shorter form
    pub fn compact_results(&self, results: &FindResults) -> CompactResults {
        let matching_tags: HashSet<Tag> = results.matching_tags.iter().cloned().collect();
        let mut unmatched_tags: HashMap<Tag, usize> = Default::default();

        // Count every tag on the matching pages that the query
        // did not already filter on.
        for page_info in &results.matching_pages {
            for page_tag in &self.tags_by_page_id[&page_info.id] {
                if !matching_tags.contains(page_tag.as_str()) {
                    *unmatched_tags.entry(page_tag.to_owned()).or_default() += 1;
                }
            }
        }

        // Pages with no unmatched tags are direct hits: the query
        // already describes them completely.
        let mut direct_hit_pages: Vec<PageInfo> = results
            .matching_pages
            .iter()
            .filter(|page_info| {
                self.tags_by_page_id[&page_info.id]
                    .iter()
                    .filter(|page_tag| !matching_tags.contains(page_tag.as_str()))
                    .count()
                    == 0
            })
            .cloned()
            .collect();

        direct_hit_pages.sort_by(|a, b| a.title.cmp(&b.title));

        // Most common refinement tags first; break ties alphabetically.
        let mut tags: Vec<_> = unmatched_tags.into_iter().collect();
        tags.sort_by(|a, b| a.1.cmp(&b.1).reverse().then_with(|| a.0.cmp(&b.0)));

        CompactResults {
            tags,
            direct_hit_pages,
        }
    }
}
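// Usage sketch (assumptions: `store` is some `page::StoreMut`
// implementation and a `TagRef` can be built from a string literal;
// neither is shown in this file):
//
//     let index = Index::new(store).await?;
//     let results = index.find(&["rust"]);
//     let compact = index.compact_results(&results);
//     // `compact.tags` lists refinement tags with page counts;
//     // `compact.direct_hit_pages` lists pages the query already
//     // describes completely.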
impl<T> Index<T> {
    /// Look up pages matching a list of tags
    pub fn find(&self, tags: &[TagRef]) -> FindResults {
        let mut matching_pages: Vec<PageInfo> = vec![];
        let mut matching_tags: Vec<Tag> = vec![];
        let mut already_tried_tags = HashSet::new();

        // An empty query matches every page.
        if tags.is_empty() {
            matching_pages = self
                .tags_by_page_id
                .keys()
                .cloned()
                .map(|id| self.page_info_by_page_id[&id].clone())
                .collect();
        }

        for tag in tags {
            if already_tried_tags.contains(tag) {
                continue;
            }
            already_tried_tags.insert(tag);

            if matching_tags.is_empty() {
                // The first matched tag seeds the result set ...
                if let Some(ids) = self.page_ids_by_tag.get(*tag) {
                    matching_pages = ids
                        .iter()
                        .map(|id| self.page_info_by_page_id[id].clone())
                        .collect();
                    matching_tags.push(tag.to_string());
                } else {
                    return FindResults::empty();
                }
            } else if let Some(ids) = self.page_ids_by_tag.get(*tag) {
                // ... and every following tag narrows it down.
                let new_matching_pages: Vec<_> = matching_pages
                    .iter()
                    .filter(|info| ids.contains(info.id.as_str()))
                    .cloned()
                    .collect();

                if new_matching_pages.is_empty() {
                    // This tag would eliminate everything; return the
                    // best results so far.
                    return FindResults {
                        matching_pages,
                        matching_tags,
                    };
                }
                matching_pages = new_matching_pages;
                matching_tags.push(tag.to_string());
            } else {
                // Unknown tag; return the best results so far.
                return FindResults {
                    matching_pages,
                    matching_tags,
                };
            }
        }

        matching_pages.sort_unstable_by_key(|info| std::cmp::Reverse(info.headers.creation_time));

        FindResults {
            matching_pages,
            matching_tags,
        }
    }

    fn add_data_for_page(&mut self, page: &page::Parsed) {
        for tag in &page.tags {
            self.page_ids_by_tag
                .entry(tag.clone())
                .or_default()
                .insert(page.id().to_owned());
        }
        self.tags_by_page_id
            .insert(page.id().to_owned(), page.tags.clone());
        self.page_info_by_page_id.insert(
            page.id().to_owned(),
            PageInfo {
                id: page.id().to_owned(),
                title: page.title.clone(),
                headers: page.headers.clone(),
            },
        );
    }

    fn clean_data_for_page(&mut self, id: Id) {
        for tag in self
            .tags_by_page_id
            .get(&id)
            .cloned()
            .unwrap_or_default()
        {
            if let Some(set) = self.page_ids_by_tag.get_mut(&tag) {
                set.remove(&id);
            }
        }
        self.tags_by_page_id.remove(&id);
        self.page_info_by_page_id.remove(&id);
    }
}

#[async_trait]
impl<T> page::StoreMut for Index<T>
where
    T: page::StoreMut + Send + Sync,
{
    async fn get(&self, id: Id) -> Result<page::Parsed> {
        self.store.get(id).await
    }

    async fn put(&mut self, page: &page::Parsed) -> Result<()> {
        self.store.put(page).await?;

        // Re-index: drop any stale data for this page first.
        if self.tags_by_page_id.contains_key(page.id()) {
            self.clean_data_for_page(page.id().to_owned());
        }
        self.add_data_for_page(page);

        Ok(())
    }

    async fn delete(&mut self, id: Id) -> Result<()> {
        self.store.delete(id.clone()).await?;

        self.clean_data_for_page(id);

        Ok(())
    }

    async fn iter<'s>(&'s self) -> Result<Box<dyn Iterator<Item = Id> + 's>> {
        self.store.iter().await
    }
}
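#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sketch of `find`'s narrowing behavior, not a full
    // integration test. It assumes `Id` and `Tag` are `String` aliases,
    // `TagRef` is `&str`, and `page::Headers` implements `Default`;
    // none of that is established by this file alone. The private maps
    // are populated by hand instead of going through `page::StoreMut`.
    #[test]
    fn find_intersects_tag_sets() {
        let mut index: Index<()> = Default::default();

        for (id, title, tags) in [
            ("1", "Rust intro", vec!["rust"]),
            ("2", "Async Rust", vec!["rust", "async"]),
        ] {
            let tags: HashSet<Tag> = tags.into_iter().map(String::from).collect();
            for tag in &tags {
                index
                    .page_ids_by_tag
                    .entry(tag.clone())
                    .or_default()
                    .insert(id.to_string());
            }
            index.tags_by_page_id.insert(id.to_string(), tags);
            index.page_info_by_page_id.insert(
                id.to_string(),
                PageInfo {
                    id: id.to_string(),
                    title: title.to_string(),
                    // assumption: `page::Headers: Default`
                    headers: Default::default(),
                },
            );
        }

        // An empty query matches every page.
        assert_eq!(index.find(&[]).matching_pages.len(), 2);

        // Only page "2" carries both tags.
        let results = index.find(&["rust", "async"]);
        assert_eq!(results.matching_pages.len(), 1);
        assert_eq!(results.matching_pages[0].id, "2");
        assert_eq!(
            results.matching_tags,
            vec!["rust".to_string(), "async".to_string()]
        );
    }
}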