mirror of
https://github.com/osmarks/random-stuff
synced 2025-01-24 16:07:07 +00:00
upload failed binary HTML thing
This commit is contained in:
parent
19e50a1186
commit
393d8632f6
1
binary-html/.gitignore
vendored
Normal file
1
binary-html/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/target
|
153
binary-html/Cargo.lock
generated
Normal file
153
binary-html/Cargo.lock
generated
Normal file
@ -0,0 +1,153 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||
|
||||
[[package]]
|
||||
name = "binary-html"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"num_enum",
|
||||
"rmp",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
|
||||
|
||||
[[package]]
|
||||
name = "derivative"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb582b60359da160a9477ee80f15c8d784c477e69c217ef2cdd4169c24ea380f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "226b45a5c2ac4dd696ed30fa6b94b057ad909c7b7fc2e0d0808192bced894066"
|
||||
dependencies = [
|
||||
"derivative",
|
||||
"num_enum_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum_derive"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c0fd9eba1d5db0994a239e09c1be402d35622277e35468ba891aa5e3188ce7e"
|
||||
dependencies = [
|
||||
"proc-macro-crate",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-crate"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785"
|
||||
dependencies = [
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04f5f085b5d71e2188cb8271e5da0161ad52c3f227a661a3c135fdf28e258b12"
|
||||
dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rmp"
|
||||
version = "0.8.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f10b46df14cf1ee1ac7baa4d2fbc2c52c0622a4b82fa8740e37bc452ac0184f"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e54c9a88f2da7238af84b5101443f0c0d0a3bbdc455e34a5c9497b1903ed55d5"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.39"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "891d8d6567fe7c7f8835a3a98af4208f3846fba258c1bc3c31d6e506239f11f9"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7dfdd070ccd8ccb78f4ad66bf1982dc37f620ef696c6b5028fe2ed83dd3d0d08"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd80fc12f73063ac132ac92aceea36734f04a1d93c1240c6944e23a3b8841793"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffc92d160b1eef40665be3a05630d003936a3bc7da7421277846c2613e92c71a"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
10
binary-html/Cargo.toml
Normal file
10
binary-html/Cargo.toml
Normal file
@ -0,0 +1,10 @@
|
||||
[package]
|
||||
name = "binary-html"
|
||||
version = "0.1.0"
|
||||
authors = ["osmarks <osmarks@protonmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
num_enum = "0.5"
|
||||
rmp = "0.8"
|
||||
thiserror = "1.0"
|
11
binary-html/README.md
Normal file
11
binary-html/README.md
Normal file
@ -0,0 +1,11 @@
|
||||
# binary-html
|
||||
|
||||
Contains a failed attempt at making a msgpack-based binary serialization format for HTML.
|
||||
This would have a number of advantages: it would likely be much faster to parse, it would not have to deal with the parsing irregularities that textual HTML retains for backward-compatibility reasons, and it would be more compact.
|
||||
Unfortunately, this implementation doesn't actually work (quite possibly because I misunderstood how readers work), the code is kind of terrible anyway, and I cannot be bothered to fix it.
|
||||
|
||||
## Format
|
||||
|
||||
A node is either text or an element. An element has a tag name and optionally children and attributes.
|
||||
Text is serialized directly to strings.
|
||||
An element becomes `[tag, attributes, children]`, where tag is either a string or a number representing one of the more common tag types, attributes is a map of strings/numbers (same idea) to strings, and children is a list of nodes. Attributes can be omitted. Children can also be omitted if attributes are too.
|
47
binary-html/serde.rs
Normal file
47
binary-html/serde.rs
Normal file
@ -0,0 +1,47 @@
|
||||
use std::collections::HashMap;
|
||||
use serde::{Serialize, Deserialize, Serializer, ser::SerializeTuple, de, de::Visitor, de::SeqAccess};
|
||||
use serde_repr::{Serialize_repr, Deserialize_repr};
|
||||
|
||||
#[derive(Serialize_repr, Deserialize_repr, PartialEq, Eq, Debug, Hash)]
|
||||
#[repr(u8)]
|
||||
enum CommonTag {
|
||||
Div,
|
||||
Span,
|
||||
P,
|
||||
H1,
|
||||
}
|
||||
#[derive(Serialize_repr, Deserialize_repr, PartialEq, Eq, Debug, Hash)]
|
||||
#[repr(u8)]
|
||||
enum CommonAttr {
|
||||
Class,
|
||||
Id,
|
||||
Href,
|
||||
}
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Hash)]
|
||||
enum Attribute { Common(CommonAttr), Other(String) }
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Hash)]
|
||||
enum Tag { Common(CommonTag), Other(String) }
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
enum Node {
|
||||
Text(String),
|
||||
Element { tag: Tag, attributes: HashMap<Attribute, String>, children: Vec<Node> },
|
||||
ChildlessElement { tag: Tag, attributes: HashMap<Attribute, String> },
|
||||
AttributelessElement { tag: Tag, children: Vec<Node> },
|
||||
ContentlessElement(Tag)
|
||||
}
|
||||
|
||||
use html5ever::driver::ParseOpts;
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use html5ever::tree_builder::TreeBuilderOpts;
|
||||
use html5ever::{parse_document, serialize};
|
||||
|
||||
fn main() {
|
||||
let opts = ParseOpts {
|
||||
tree_builder: TreeBuilderOpts {
|
||||
drop_doctype: true,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
println!("Hello, world!");
|
||||
}
|
189
binary-html/src/main.rs
Normal file
189
binary-html/src/main.rs
Normal file
@ -0,0 +1,189 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use rmp::{encode, decode, decode::NumValueReadError};
|
||||
use std::convert::TryFrom;
|
||||
use num_enum::{IntoPrimitive, TryFromPrimitive};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(IntoPrimitive, TryFromPrimitive)]
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
enum CommonTag {
|
||||
Div = 0,
|
||||
Span = 1,
|
||||
P = 2,
|
||||
H1 = 3,
|
||||
}
|
||||
#[derive(IntoPrimitive, TryFromPrimitive)]
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
enum CommonAttr {
|
||||
Class = 0,
|
||||
Id = 1,
|
||||
Href = 2,
|
||||
}
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone)]
|
||||
enum Attribute { Common(CommonAttr), Other(String) }
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone)]
|
||||
enum Tag { Common(CommonTag), Other(String) }
|
||||
#[derive(PartialEq, Eq, Debug, Clone)]
|
||||
enum Node {
|
||||
Text(String),
|
||||
Element { tag: Tag, attributes: HashMap<Attribute, String>, children: Vec<Node> },
|
||||
SimpleElement { tag: Tag, children: Vec<Node> },
|
||||
EmptyElement(Tag)
|
||||
}
|
||||
|
||||
fn encode_tag<W: io::Write>(wr: &mut W, tag: &Tag) -> Result<(), encode::ValueWriteError> {
|
||||
match tag {
|
||||
Tag::Common(t) => encode::write_u8(wr, (*t).into()),
|
||||
Tag::Other(t) => encode::write_str(wr, t)
|
||||
}
|
||||
}
|
||||
fn encode_attr<W: io::Write>(wr: &mut W, attr: &Attribute) -> Result<(), encode::ValueWriteError> {
|
||||
match attr {
|
||||
Attribute::Common(a) => encode::write_u8(wr, (*a).into()),
|
||||
Attribute::Other(a) => encode::write_str(wr, a)
|
||||
}
|
||||
}
|
||||
|
||||
fn encode_node<W: io::Write>(wr: &mut W, node: &Node) -> Result<(), encode::ValueWriteError> {
|
||||
match node {
|
||||
Node::Text(s) => encode::write_str(wr, s),
|
||||
Node::Element { tag, attributes, children } => {
|
||||
encode::write_array_len(wr, 3)?;
|
||||
encode_tag(wr, tag)?;
|
||||
encode::write_map_len(wr, attributes.len() as u32)?;
|
||||
for (k, v) in attributes {
|
||||
encode_attr(wr, k)?;
|
||||
encode::write_str(wr, v)?;
|
||||
}
|
||||
encode::write_array_len(wr, children.len() as u32)?;
|
||||
for child in children {
|
||||
encode_node(wr, child)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
Node::SimpleElement { tag, children } => {
|
||||
encode::write_array_len(wr, 2)?;
|
||||
encode_tag(wr, tag)?;
|
||||
encode::write_array_len(wr, children.len() as u32)?;
|
||||
for child in children {
|
||||
encode_node(wr, child)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
Node::EmptyElement(tag) =>{
|
||||
encode::write_array_len(wr, 1)?;
|
||||
encode_tag(wr, tag)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
enum DecodeError {
|
||||
#[error("tag ID {0} not known")]
|
||||
InvalidTagID(u8),
|
||||
#[error("attribute ID {0} not known")]
|
||||
InvalidAttrID(u8),
|
||||
// TODO
|
||||
#[error("parse fail")]
|
||||
ParseError
|
||||
}
|
||||
|
||||
fn decode_string<R: io::Read>(r: &mut R) -> Result<String, DecodeError> {
|
||||
let len = decode::read_str_len(r).map_err(|_| DecodeError::ParseError)?;
|
||||
let mut buf = Vec::with_capacity(len as usize);
|
||||
println!("{:?}", buf);
|
||||
r.read(&mut buf).map_err(|_| DecodeError::ParseError)?;
|
||||
Ok(String::from_utf8(buf).map_err(|_| DecodeError::ParseError)?)
|
||||
}
|
||||
|
||||
fn decode_tag<R: io::Read>(r: &mut R) -> Result<Tag, DecodeError> {
|
||||
match decode::read_int(r) {
|
||||
Ok(x) => {
|
||||
let x: u8 = x; // satisfy type inference
|
||||
Ok(Tag::Common(CommonTag::try_from(x).map_err(|x: num_enum::TryFromPrimitiveError<CommonTag>| DecodeError::InvalidTagID(x.number))?))
|
||||
},
|
||||
Err(e) => match e {
|
||||
NumValueReadError::TypeMismatch(_) => Ok(Tag::Other(decode_string(r)?)),
|
||||
_ => Err(DecodeError::ParseError)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_attr<R: io::Read>(r: &mut R) -> Result<Attribute, DecodeError> {
|
||||
match decode::read_int(r) {
|
||||
Ok(x) => {
|
||||
let x: u8 = x; // satisfy type inference
|
||||
Ok(Attribute::Common(CommonAttr::try_from(x)
|
||||
.map_err(|x: num_enum::TryFromPrimitiveError<CommonAttr>| DecodeError::InvalidAttrID(x.number))?))
|
||||
},
|
||||
Err(e) => match e {
|
||||
NumValueReadError::TypeMismatch(_) => Ok(Attribute::Other(decode_string(r)?)),
|
||||
_ => Err(DecodeError::ParseError)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_nodes<R: io::Read>(r: &mut R) -> Result<Vec<Node>, DecodeError> {
|
||||
let len = decode::read_array_len(r).map_err(|_| DecodeError::ParseError)?;
|
||||
let mut out = Vec::with_capacity(len as usize);
|
||||
for _ in 0..len {
|
||||
out.push(decode_node(r)?);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
fn decode_node<R: io::Read>(r: &mut R) -> Result<Node, DecodeError> {
|
||||
match decode::read_array_len(r) {
|
||||
Ok(len) => {
|
||||
if len > 3 || len < 1 {
|
||||
return Err(DecodeError::ParseError)
|
||||
}
|
||||
let tag = decode_tag(r)?;
|
||||
if len > 1 {
|
||||
if len > 2 {
|
||||
let maplen = decode::read_map_len(r).map_err(|_| DecodeError::ParseError)?;
|
||||
let mut attrs = HashMap::with_capacity(maplen as usize);
|
||||
for _ in 0..maplen {
|
||||
let key = decode_attr(r)?;
|
||||
let val = decode_string(r)?;
|
||||
attrs.insert(key, val);
|
||||
}
|
||||
let children = decode_nodes(r)?;
|
||||
Ok(Node::Element { tag, attributes: attrs, children })
|
||||
} else {
|
||||
Ok(Node::SimpleElement { tag, children: decode_nodes(r)? })
|
||||
}
|
||||
} else {
|
||||
Ok(Node::EmptyElement(tag))
|
||||
}
|
||||
},
|
||||
Err(decode::ValueReadError::TypeMismatch(_)) => decode_string(r).map(Node::Text),
|
||||
Err(_) => Err(DecodeError::ParseError)
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut out = Vec::new();
|
||||
let attrs1 = vec![
|
||||
(Attribute::Common(CommonAttr::Class), "test1".to_string()),
|
||||
(Attribute::Common(CommonAttr::Id), "idtest".to_string()),
|
||||
(Attribute::Other("test-attr".to_string()), "test attr content".to_string())
|
||||
].into_iter().collect();
|
||||
let attrs2 = vec![
|
||||
(Attribute::Common(CommonAttr::Href), "/test-href".to_string()),
|
||||
(Attribute::Other("test-attr-2".to_string()), "test attr 2 content".to_string()),
|
||||
(Attribute::Common(CommonAttr::Id), "id2".to_string()),
|
||||
].into_iter().collect();
|
||||
let node2 = Node::Element { tag: Tag::Common(CommonTag::Div), attributes: attrs2,
|
||||
children: vec![Node::Text(String::from("hello, 2"))] };
|
||||
let node1 = Node::Element { tag: Tag::Common(CommonTag::Div), attributes: attrs1,
|
||||
children: vec![Node::Text(String::from("hello, world")), node2] };
|
||||
encode_node(&mut out, &node1).unwrap();
|
||||
println!("{:?} {:?}", out, node1);
|
||||
let res = decode_node(&mut std::io::Cursor::new(out)).unwrap();
|
||||
assert_eq!(res, node1);
|
||||
}
|
Loading…
Reference in New Issue
Block a user