1
0
mirror of https://github.com/osmarks/random-stuff synced 2024-11-17 17:34:48 +00:00
random-stuff/calibre-indexer/qxml.rs

232 lines
8.4 KiB
Rust

// Earlier version attempting to use quick-xml
// Dropped because SQLite appears to be what most of the time is spent in anyway, and because quick-xml had some issues wrt. escaping
use std::fs;
use anyhow::{Result, Context};
use crossbeam::channel::{bounded};
use crossbeam::thread;
use std::path::PathBuf;
use rusqlite::{params, Connection};
use std::fs::File;
use xml::reader::{EventReader, XmlEvent, ParserConfig};
use quick_xml::{Reader, events::Event};
use std::io::BufReader;
use epub::doc::EpubDoc;
use lazy_static::lazy_static;
use std::collections::HashMap;
#[derive(Debug, Clone)]
struct BookMeta {
title: String,
author: String,
description: String
}
#[derive(Debug, Clone, Copy, PartialEq)]
enum XMLReadState {
None,
ReadingTitle,
ReadingAuthor,
ReadingDescription
}
lazy_static! {
static ref ESCAPES: HashMap<Vec<u8>, Vec<u8>> = {
let mut m = HashMap::new();
m.insert(b"nbsp".to_vec(), b"\xc2\xa0".to_vec());
m.insert(b"copy".to_vec(), b"\xc2\xa9".to_vec());
m.insert(b"eacute".to_vec(), b"\xc3\x89".to_vec());
m.insert(b"shy".to_vec(), b"\xc2\xad".to_vec());
m.insert(b"iuml".to_vec(), b"\xc3\x8f".to_vec());
m
};
}
// Extract text from an XHTML page in an ebook
// Ignores <script>, <style>, etc
// Also extracts chapter titles via assuming that any <hN> is part of a chapter title
fn extract_text(r: Vec<u8>) -> Result<(String, String)> {
//println!("{:?}", String::from_utf8_lossy(r.as_slice()));
let mut text = String::new();
let conf = ParserConfig::new()
.ignore_comments(true)
.cdata_to_characters(true);
let mut ignoring = false;
let mut newline_appended_last = false;
let mut in_header = None;
let mut chapter = String::new();
let mut reader = Reader::from_reader(r.as_slice());
let mut buf = Vec::new();
reader.trim_text_end(true);
loop {
match reader.read_event(&mut buf)? {
Event::Start(ref e) => {
match e.name() {
b"style" | b"script" | b"nav" | b"iframe" | b"svg" => { ignoring = true },
b"h1" | b"h2" | b"h3" | b"h4" | b"h5" | b"h6" => { ignoring = false; in_header = Some(e.name().to_vec()) }
_ => { ignoring = false }
}
},
Event::Text(new) => {
if !ignoring {
text += &new.unescape_and_decode_with_custom_entities(&reader, &*ESCAPES)?;
if in_header.is_some() && &*new != b"\xA7" && &*new != b"*" {
chapter += &new.unescape_and_decode_with_custom_entities(&reader, &*ESCAPES)?;
}
newline_appended_last = false;
}
},
Event::Eof => break,
Event::End(ref e) => {
if let Some(ref h) = in_header {
if h == &e.name() {
chapter += "\n";
in_header = None;
}
}
ignoring = false;
match e.name() {
b"span" | b"sub" | b"sup" | b"small" | b"i" | b"b" | b"em" | b"strike" | b"strong" | b"a" | b"link" | b"head" => {}
x => {
if !newline_appended_last {
text += "\n";
newline_appended_last = true;
if in_header.is_some() && x == b"br" {
chapter += "\n";
}
}
}
}
}
_ => {}
}
}
Ok((text, chapter))
}
fn read_opf(path: PathBuf) -> Result<BookMeta> {
let file = File::open(path)?;
let file = BufReader::new(file);
let conf = ParserConfig::new()
.ignore_comments(true)
.cdata_to_characters(true);
let mut meta = BookMeta {
title: "".to_string(),
author: "".to_string(),
description: "".to_string()
};
let mut buf = String::new();
let mut state = XMLReadState::None;
for e in EventReader::new_with_config(file, conf) {
match e? {
XmlEvent::StartElement { name, .. } => {
match name.local_name.as_str() {
"title" => { state = XMLReadState::ReadingTitle },
"creator" => { state = XMLReadState::ReadingAuthor },
"description" => { state = XMLReadState::ReadingDescription },
_ => {}
}
},
XmlEvent::Characters(s) => {
if state != XMLReadState::None {
buf += &s;
}
},
XmlEvent::EndElement { .. } => {
match state {
XMLReadState::ReadingTitle => { meta.title = buf.clone() },
XMLReadState::ReadingDescription => { meta.description = buf.clone() },
XMLReadState::ReadingAuthor => { meta.author = buf.clone() },
XMLReadState::None => {}
}
state = XMLReadState::None;
buf.clear();
}
_ => {}
}
}
Ok(meta)
}
fn path_append(p: &PathBuf, c: &str) -> PathBuf {
let mut o = p.clone();
o.push(c);
o
}
fn main() -> Result<()> {
let (tx, rx) = bounded::<PathBuf>(16);
let res: Result<()> = thread::scope(|sc| {
let db = Connection::open("./data.sqlite3")?;
db.execute_batch("
BEGIN;
CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY,
path BLOB NOT NULL,
last_modified INTEGER NOT NULL
);
CREATE VIRTUAL TABLE IF NOT EXISTS data USING fts5 (
author,
book,
chapter,
content,
file
);
COMMIT;
").with_context(|| "database initialization")?;
let mut threads: Vec<thread::ScopedJoinHandle<()>> = vec![];
for i in 0..num_cpus::get() {
let rx = rx.clone();
let go = move || -> Result<()> {
for book_dir in rx.iter() {
println!("{} begin handling {:?}", i, book_dir);
let meta = read_opf(path_append(&book_dir, "metadata.opf")).with_context(|| format!("OPF metadata parsing for {:?}", book_dir))?;
let epub_path = path_append(&book_dir, &format!("{} - {}.epub", meta.title, meta.author));
if epub_path.exists() {
let mut doc = EpubDoc::new(epub_path).with_context(|| format!("reading {:?}", book_dir))?;
let spine = doc.spine.clone();
for resource in spine {
let content = doc.get_resource(&resource).with_context(|| format!("reading {:?} in {:?}", &resource, book_dir))?;
let (mut content, chapter) = extract_text(content).with_context(|| format!("parsing {:?} in {:?}", &resource, book_dir))?;
let chapter = chapter.trim();
// in place trim of newlines - avoid allocating new string
while content.ends_with("\n") {
content.truncate(content.len() - 1);
}
if content != "" {
//println!("{}: {}: {}", meta.title, resource, chapter);
}
}
//println!("{} - {}", meta.author, meta.title);
}
println!("{} end handling {:?}", i, book_dir);
}
Ok(())
};
threads.push(sc.spawn(move |_| { go().unwrap() }));
}
for author_dir in fs::read_dir("/data/calibre").with_context(|| "reading library location")? {
let author_dir = author_dir?;
if author_dir.file_type()?.is_dir() {
for book_dir in fs::read_dir(author_dir.path())? {
let path = book_dir?.path();
//println!("{:?}", path);
tx.send(path)?;
}
}
}
std::mem::drop(tx);
for thread in threads {
thread.join().unwrap();
}
Ok(())
}).unwrap();
res?;
Ok(())
}