mirror of
https://github.com/osmarks/random-stuff
synced 2025-01-28 01:44:54 +00:00
232 lines
8.4 KiB
Rust
232 lines
8.4 KiB
Rust
// Earlier version, attempting to use quick-xml.
// Dropped because SQLite appears to be where most of the time is spent anyway, and because quick-xml had some issues with escaping.
|
|
|
|
use std::fs;
|
|
use anyhow::{Result, Context};
|
|
use crossbeam::channel::{bounded};
|
|
use crossbeam::thread;
|
|
use std::path::PathBuf;
|
|
use rusqlite::{params, Connection};
|
|
use std::fs::File;
|
|
use xml::reader::{EventReader, XmlEvent, ParserConfig};
|
|
use quick_xml::{Reader, events::Event};
|
|
use std::io::BufReader;
|
|
use epub::doc::EpubDoc;
|
|
use lazy_static::lazy_static;
|
|
use std::collections::HashMap;
|
|
|
|
/// Metadata of a single book as read from its OPF file.
///
/// `Default` yields a value with every field empty, which is the starting
/// state used while parsing.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct BookMeta {
    /// Text of the `title` element.
    title: String,
    /// Text of the `creator` element.
    author: String,
    /// Text of the `description` element.
    description: String,
}
|
|
|
|
/// Which metadata field, if any, the OPF parser is currently collecting text for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum XMLReadState {
    /// Not inside an element of interest; character data is discarded.
    None,
    /// Inside a `title` element.
    ReadingTitle,
    /// Inside a `creator` element.
    ReadingAuthor,
    /// Inside a `description` element.
    ReadingDescription,
}
|
|
|
|
lazy_static! {
    /// Named character entities that are passed to quick-xml as custom entities
    /// when unescaping text, mapping entity name to its UTF-8 encoding.
    ///
    /// Entity names are case-sensitive: `eacute` and `iuml` are the lowercase
    /// letters U+00E9 and U+00EF (the uppercase forms are `Eacute` / `Iuml`).
    static ref ESCAPES: HashMap<Vec<u8>, Vec<u8>> = {
        // Written as code points so the UTF-8 bytes cannot be mistyped.
        const NAMED: [(&str, &str); 5] = [
            ("nbsp", "\u{a0}"),   // no-break space
            ("copy", "\u{a9}"),   // copyright sign
            ("eacute", "\u{e9}"), // e with acute
            ("shy", "\u{ad}"),    // soft hyphen
            ("iuml", "\u{ef}"),   // i with diaeresis
        ];
        NAMED
            .iter()
            .map(|&(name, value)| (name.as_bytes().to_vec(), value.as_bytes().to_vec()))
            .collect()
    };
}
|
|
|
|
// Extract text from an XHTML page in an ebook
|
|
// Ignores <script>, <style>, etc
|
|
// Also extracts chapter titles via assuming that any <hN> is part of a chapter title
|
|
fn extract_text(r: Vec<u8>) -> Result<(String, String)> {
|
|
//println!("{:?}", String::from_utf8_lossy(r.as_slice()));
|
|
let mut text = String::new();
|
|
let conf = ParserConfig::new()
|
|
.ignore_comments(true)
|
|
.cdata_to_characters(true);
|
|
|
|
let mut ignoring = false;
|
|
let mut newline_appended_last = false;
|
|
let mut in_header = None;
|
|
let mut chapter = String::new();
|
|
let mut reader = Reader::from_reader(r.as_slice());
|
|
let mut buf = Vec::new();
|
|
reader.trim_text_end(true);
|
|
loop {
|
|
match reader.read_event(&mut buf)? {
|
|
Event::Start(ref e) => {
|
|
match e.name() {
|
|
b"style" | b"script" | b"nav" | b"iframe" | b"svg" => { ignoring = true },
|
|
b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" => { ignoring = false; in_header = Some(e.name().to_vec()) }
|
|
_ => { ignoring = false }
|
|
}
|
|
},
|
|
Event::Text(new) => {
|
|
if !ignoring {
|
|
text += &new.unescape_and_decode_with_custom_entities(&reader, &*ESCAPES)?;
|
|
if in_header.is_some() && &*new != b"\xA7" && &*new != b"*" {
|
|
chapter += &new.unescape_and_decode_with_custom_entities(&reader, &*ESCAPES)?;
|
|
}
|
|
newline_appended_last = false;
|
|
}
|
|
},
|
|
Event::Eof => break,
|
|
Event::End(ref e) => {
|
|
if let Some(ref h) = in_header {
|
|
if h == &e.name() {
|
|
chapter += "\n";
|
|
in_header = None;
|
|
}
|
|
}
|
|
ignoring = false;
|
|
match e.name() {
|
|
b"span" | b"sub" | b"sup" | b"small" | b"i" | b"b" | b"em" | b"strike" | b"strong" | b"a" | b"link" | b"head" => {}
|
|
x => {
|
|
if !newline_appended_last {
|
|
text += "\n";
|
|
newline_appended_last = true;
|
|
if in_header.is_some() && x == b"br" {
|
|
chapter += "\n";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
Ok((text, chapter))
|
|
}
|
|
|
|
fn read_opf(path: PathBuf) -> Result<BookMeta> {
|
|
let file = File::open(path)?;
|
|
let file = BufReader::new(file);
|
|
let conf = ParserConfig::new()
|
|
.ignore_comments(true)
|
|
.cdata_to_characters(true);
|
|
|
|
let mut meta = BookMeta {
|
|
title: "".to_string(),
|
|
author: "".to_string(),
|
|
description: "".to_string()
|
|
};
|
|
let mut buf = String::new();
|
|
let mut state = XMLReadState::None;
|
|
for e in EventReader::new_with_config(file, conf) {
|
|
match e? {
|
|
XmlEvent::StartElement { name, .. } => {
|
|
match name.local_name.as_str() {
|
|
"title" => { state = XMLReadState::ReadingTitle },
|
|
"creator" => { state = XMLReadState::ReadingAuthor },
|
|
"description" => { state = XMLReadState::ReadingDescription },
|
|
_ => {}
|
|
}
|
|
},
|
|
XmlEvent::Characters(s) => {
|
|
if state != XMLReadState::None {
|
|
buf += &s;
|
|
}
|
|
},
|
|
XmlEvent::EndElement { .. } => {
|
|
match state {
|
|
XMLReadState::ReadingTitle => { meta.title = buf.clone() },
|
|
XMLReadState::ReadingDescription => { meta.description = buf.clone() },
|
|
XMLReadState::ReadingAuthor => { meta.author = buf.clone() },
|
|
XMLReadState::None => {}
|
|
}
|
|
state = XMLReadState::None;
|
|
buf.clear();
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
Ok(meta)
|
|
}
|
|
|
|
/// Returns a new path made of `p` with the component `c` appended.
///
/// `p` itself is not modified. As with `PathBuf::push`, an absolute `c`
/// replaces `p` entirely. Accepts any `&Path`; existing `&PathBuf` arguments
/// keep working through deref coercion.
fn path_append(p: &std::path::Path, c: &str) -> PathBuf {
    p.join(c)
}
|
|
|
|
fn main() -> Result<()> {
|
|
let (tx, rx) = bounded::<PathBuf>(16);
|
|
let res: Result<()> = thread::scope(|sc| {
|
|
let db = Connection::open("./data.sqlite3")?;
|
|
db.execute_batch("
|
|
BEGIN;
|
|
CREATE TABLE IF NOT EXISTS files (
|
|
id INTEGER PRIMARY KEY,
|
|
path BLOB NOT NULL,
|
|
last_modified INTEGER NOT NULL
|
|
);
|
|
CREATE VIRTUAL TABLE IF NOT EXISTS data USING fts5 (
|
|
author,
|
|
book,
|
|
chapter,
|
|
content,
|
|
file
|
|
);
|
|
COMMIT;
|
|
").with_context(|| "database initialization")?;
|
|
|
|
let mut threads: Vec<thread::ScopedJoinHandle<()>> = vec![];
|
|
for i in 0..num_cpus::get() {
|
|
let rx = rx.clone();
|
|
let go = move || -> Result<()> {
|
|
for book_dir in rx.iter() {
|
|
println!("{} begin handling {:?}", i, book_dir);
|
|
let meta = read_opf(path_append(&book_dir, "metadata.opf")).with_context(|| format!("OPF metadata parsing for {:?}", book_dir))?;
|
|
let epub_path = path_append(&book_dir, &format!("{} - {}.epub", meta.title, meta.author));
|
|
if epub_path.exists() {
|
|
let mut doc = EpubDoc::new(epub_path).with_context(|| format!("reading {:?}", book_dir))?;
|
|
let spine = doc.spine.clone();
|
|
for resource in spine {
|
|
let content = doc.get_resource(&resource).with_context(|| format!("reading {:?} in {:?}", &resource, book_dir))?;
|
|
let (mut content, chapter) = extract_text(content).with_context(|| format!("parsing {:?} in {:?}", &resource, book_dir))?;
|
|
let chapter = chapter.trim();
|
|
// in place trim of newlines - avoid allocating new string
|
|
while content.ends_with("\n") {
|
|
content.truncate(content.len() - 1);
|
|
}
|
|
if content != "" {
|
|
//println!("{}: {}: {}", meta.title, resource, chapter);
|
|
}
|
|
}
|
|
//println!("{} - {}", meta.author, meta.title);
|
|
}
|
|
println!("{} end handling {:?}", i, book_dir);
|
|
}
|
|
Ok(())
|
|
};
|
|
threads.push(sc.spawn(move |_| { go().unwrap() }));
|
|
}
|
|
|
|
for author_dir in fs::read_dir("/data/calibre").with_context(|| "reading library location")? {
|
|
let author_dir = author_dir?;
|
|
if author_dir.file_type()?.is_dir() {
|
|
for book_dir in fs::read_dir(author_dir.path())? {
|
|
let path = book_dir?.path();
|
|
//println!("{:?}", path);
|
|
tx.send(path)?;
|
|
}
|
|
}
|
|
}
|
|
std::mem::drop(tx);
|
|
|
|
for thread in threads {
|
|
thread.join().unwrap();
|
|
}
|
|
Ok(())
|
|
}).unwrap();
|
|
res?;
|
|
Ok(())
|
|
}
|