1
0
mirror of https://github.com/osmarks/random-stuff synced 2025-01-02 21:40:35 +00:00

upload failed binary HTML thing

This commit is contained in:
osmarks 2020-08-31 21:36:44 +01:00
parent 19e50a1186
commit 393d8632f6
6 changed files with 411 additions and 0 deletions

1
binary-html/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

153
binary-html/Cargo.lock generated Normal file
View File

@ -0,0 +1,153 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "binary-html"
version = "0.1.0"
dependencies = [
"num_enum",
"rmp",
"thiserror",
]
[[package]]
name = "byteorder"
version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
[[package]]
name = "derivative"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb582b60359da160a9477ee80f15c8d784c477e69c217ef2cdd4169c24ea380f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "num-traits"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611"
dependencies = [
"autocfg",
]
[[package]]
name = "num_enum"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "226b45a5c2ac4dd696ed30fa6b94b057ad909c7b7fc2e0d0808192bced894066"
dependencies = [
"derivative",
"num_enum_derive",
]
[[package]]
name = "num_enum_derive"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c0fd9eba1d5db0994a239e09c1be402d35622277e35468ba891aa5e3188ce7e"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "proc-macro-crate"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785"
dependencies = [
"toml",
]
[[package]]
name = "proc-macro2"
version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04f5f085b5d71e2188cb8271e5da0161ad52c3f227a661a3c135fdf28e258b12"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quote"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rmp"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f10b46df14cf1ee1ac7baa4d2fbc2c52c0622a4b82fa8740e37bc452ac0184f"
dependencies = [
"byteorder",
"num-traits",
]
[[package]]
name = "serde"
version = "1.0.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e54c9a88f2da7238af84b5101443f0c0d0a3bbdc455e34a5c9497b1903ed55d5"
[[package]]
name = "syn"
version = "1.0.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "891d8d6567fe7c7f8835a3a98af4208f3846fba258c1bc3c31d6e506239f11f9"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "thiserror"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dfdd070ccd8ccb78f4ad66bf1982dc37f620ef696c6b5028fe2ed83dd3d0d08"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd80fc12f73063ac132ac92aceea36734f04a1d93c1240c6944e23a3b8841793"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "toml"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffc92d160b1eef40665be3a05630d003936a3bc7da7421277846c2613e92c71a"
dependencies = [
"serde",
]
[[package]]
name = "unicode-xid"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"

10
binary-html/Cargo.toml Normal file
View File

@ -0,0 +1,10 @@
[package]
name = "binary-html"
version = "0.1.0"
authors = ["osmarks <osmarks@protonmail.com>"]
edition = "2018"
[dependencies]
num_enum = "0.5"
rmp = "0.8"
thiserror = "1.0"

11
binary-html/README.md Normal file
View File

@ -0,0 +1,11 @@
# binary-html
Contains a failed attempt at making a msgpack-based binary serialization format for HTML.
This would have a number of advantages: it would likely be much faster to parse, it would not have to deal with all the weird parsing irregularities that textual HTML must keep for backward-compatibility reasons, and it would be more compact.
Unfortunately, this implementation doesn't actually work (quite possibly because I misunderstood how readers work), the code is kind of terrible anyway, and I cannot be bothered to fix it.
## Format
A node is either text or an element. An element has a tag name and optionally children and attributes.
Text is serialized directly to strings.
An element becomes `[tag, attributes, children]`, where tag is either a string or a number representing one of the more common tag types, attributes is a map of strings/numbers (same idea) to strings, and children is a list of nodes. Attributes can be omitted, giving `[tag, children]`. Children can also be omitted, but only if attributes are omitted too, giving `[tag]`.

47
binary-html/serde.rs Normal file
View File

@ -0,0 +1,47 @@
use std::collections::HashMap;
use serde::{Serialize, Deserialize, Serializer, ser::SerializeTuple, de, de::Visitor, de::SeqAccess};
use serde_repr::{Serialize_repr, Deserialize_repr};
/// Tag names singled out for a compact numeric representation instead of a string.
///
/// No explicit discriminants are given, so the variants are numbered 0, 1, 2, 3
/// in declaration order.
// NOTE(review): the `*_repr` derives presumably (de)serialize the `u8`
// discriminant — confirm against the serde_repr documentation.
#[derive(Serialize_repr, Deserialize_repr, PartialEq, Eq, Debug, Hash)]
#[repr(u8)]
enum CommonTag {
Div,
Span,
P,
H1,
}
/// Attribute names singled out for a compact numeric representation instead of a string.
///
/// No explicit discriminants are given, so the variants are numbered 0, 1, 2
/// in declaration order.
// NOTE(review): the `*_repr` derives presumably (de)serialize the `u8`
// discriminant — confirm against the serde_repr documentation.
#[derive(Serialize_repr, Deserialize_repr, PartialEq, Eq, Debug, Hash)]
#[repr(u8)]
enum CommonAttr {
Class,
Id,
Href,
}
/// An attribute name: either one of the well-known [`CommonAttr`] values or an
/// arbitrary string. Derives `Hash`/`Eq` so it can be used as a `HashMap` key.
#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Hash)]
enum Attribute { Common(CommonAttr), Other(String) }
/// An element's tag name: either one of the well-known [`CommonTag`] values or
/// an arbitrary string.
#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Hash)]
enum Tag { Common(CommonTag), Other(String) }
/// A document node: text, or an element in one of four shapes depending on
/// which of attributes/children are present.
#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
enum Node {
// Plain text content.
Text(String),
// Element carrying both attributes and children.
Element { tag: Tag, attributes: HashMap<Attribute, String>, children: Vec<Node> },
// Element with attributes but no children.
ChildlessElement { tag: Tag, attributes: HashMap<Attribute, String> },
// Element with children but no attributes.
AttributelessElement { tag: Tag, children: Vec<Node> },
// Element with neither attributes nor children.
ContentlessElement(Tag)
}
use html5ever::driver::ParseOpts;
use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use html5ever::{parse_document, serialize};
/// Placeholder entry point for the serde-based experiment.
///
/// Builds the html5ever parse options (with doctype dropping enabled) but does
/// not parse anything yet; it only prints a greeting.
fn main() {
    // Scaffolding only: the options are not yet handed to a parser, so the
    // binding is underscore-prefixed to avoid an unused-variable warning.
    let _opts = ParseOpts {
        tree_builder: TreeBuilderOpts {
            drop_doctype: true,
            ..Default::default()
        },
        ..Default::default()
    };
    println!("Hello, world!");
}

189
binary-html/src/main.rs Normal file
View File

@ -0,0 +1,189 @@
use std::collections::HashMap;
use std::io;
use rmp::{encode, decode, decode::NumValueReadError};
use std::convert::TryFrom;
use num_enum::{IntoPrimitive, TryFromPrimitive};
use thiserror::Error;
/// Tag names that are written as a single `u8` ID instead of a string.
///
/// The explicit discriminants are the IDs that `encode_tag` writes and that
/// `decode_tag` converts back via `try_from`.
#[derive(IntoPrimitive, TryFromPrimitive)]
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
#[repr(u8)]
enum CommonTag {
Div = 0,
Span = 1,
P = 2,
H1 = 3,
}
/// Attribute names that are written as a single `u8` ID instead of a string.
///
/// The explicit discriminants are the IDs that `encode_attr` writes and that
/// `decode_attr` converts back via `try_from`.
#[derive(IntoPrimitive, TryFromPrimitive)]
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
#[repr(u8)]
enum CommonAttr {
Class = 0,
Id = 1,
Href = 2,
}
/// An attribute name: a well-known [`CommonAttr`] (encoded as a number) or an
/// arbitrary name (encoded as a string). Used as the key type of an element's
/// attribute map.
#[derive(PartialEq, Eq, Debug, Hash, Clone)]
enum Attribute { Common(CommonAttr), Other(String) }
/// An element's tag name: a well-known [`CommonTag`] (encoded as a number) or
/// an arbitrary name (encoded as a string).
#[derive(PartialEq, Eq, Debug, Hash, Clone)]
enum Tag { Common(CommonTag), Other(String) }
/// A document node. The variant determines the encoded shape produced by
/// `encode_node`.
#[derive(PartialEq, Eq, Debug, Clone)]
enum Node {
// Text content; encoded as a bare string.
Text(String),
// Encoded as a 3-element array: tag, attribute map, children array.
Element { tag: Tag, attributes: HashMap<Attribute, String>, children: Vec<Node> },
// Encoded as a 2-element array: tag, children array (no attribute map).
SimpleElement { tag: Tag, children: Vec<Node> },
// Encoded as a 1-element array holding only the tag.
EmptyElement(Tag)
}
/// Writes a tag name to `wr`.
///
/// A [`Tag::Common`] is written as its `u8` ID via `write_u8`; a
/// [`Tag::Other`] is written as a string via `write_str`.
///
/// # Errors
/// Propagates whatever write error the underlying encoder reports.
fn encode_tag<W: io::Write>(wr: &mut W, tag: &Tag) -> Result<(), encode::ValueWriteError> {
    match *tag {
        Tag::Common(known) => {
            let id: u8 = known.into();
            encode::write_u8(wr, id)
        }
        Tag::Other(ref name) => encode::write_str(wr, name),
    }
}
/// Writes an attribute name to `wr`.
///
/// An [`Attribute::Common`] is written as its `u8` ID via `write_u8`; an
/// [`Attribute::Other`] is written as a string via `write_str`.
///
/// # Errors
/// Propagates whatever write error the underlying encoder reports.
fn encode_attr<W: io::Write>(wr: &mut W, attr: &Attribute) -> Result<(), encode::ValueWriteError> {
    match *attr {
        Attribute::Common(known) => {
            let id: u8 = known.into();
            encode::write_u8(wr, id)
        }
        Attribute::Other(ref name) => encode::write_str(wr, name),
    }
}
/// Serializes `node` (and, recursively, its descendants) to `wr`.
///
/// Shapes written:
/// * `Text` — a bare string.
/// * `Element` — array of 3: tag, attribute map, children array.
/// * `SimpleElement` — array of 2: tag, children array.
/// * `EmptyElement` — array of 1: tag.
///
/// # Errors
/// Returns the first write error reported by the underlying encoder.
fn encode_node<W: io::Write>(wr: &mut W, node: &Node) -> Result<(), encode::ValueWriteError> {
    /// Writes the children array header followed by every child node in order.
    fn write_children<W: io::Write>(
        wr: &mut W,
        children: &Vec<Node>,
    ) -> Result<(), encode::ValueWriteError> {
        encode::write_array_len(wr, children.len() as u32)?;
        for kid in children {
            encode_node(wr, kid)?;
        }
        Ok(())
    }

    match node {
        Node::Text(text) => encode::write_str(wr, text),
        Node::EmptyElement(tag) => {
            encode::write_array_len(wr, 1)?;
            encode_tag(wr, tag)?;
            Ok(())
        }
        Node::SimpleElement { tag, children } => {
            encode::write_array_len(wr, 2)?;
            encode_tag(wr, tag)?;
            write_children(wr, children)
        }
        Node::Element { tag, attributes, children } => {
            encode::write_array_len(wr, 3)?;
            encode_tag(wr, tag)?;
            // Attribute map: header first, then alternating key / value entries.
            encode::write_map_len(wr, attributes.len() as u32)?;
            for (name, value) in attributes {
                encode_attr(wr, name)?;
                encode::write_str(wr, value)?;
            }
            write_children(wr, children)
        }
    }
}
/// Errors produced by the `decode_*` functions.
#[derive(Error, Debug)]
enum DecodeError {
// A numeric tag ID was read that does not map to any `CommonTag` variant.
#[error("tag ID {0} not known")]
InvalidTagID(u8),
// A numeric attribute ID was read that does not map to any `CommonAttr` variant.
#[error("attribute ID {0} not known")]
InvalidAttrID(u8),
// TODO
// Catch-all used for every other failure (read errors, unexpected markers,
// invalid UTF-8, out-of-range array lengths); carries no detail.
#[error("parse fail")]
ParseError
}
fn decode_string<R: io::Read>(r: &mut R) -> Result<String, DecodeError> {
let len = decode::read_str_len(r).map_err(|_| DecodeError::ParseError)?;
let mut buf = Vec::with_capacity(len as usize);
println!("{:?}", buf);
r.read(&mut buf).map_err(|_| DecodeError::ParseError)?;
Ok(String::from_utf8(buf).map_err(|_| DecodeError::ParseError)?)
}
fn decode_tag<R: io::Read>(r: &mut R) -> Result<Tag, DecodeError> {
match decode::read_int(r) {
Ok(x) => {
let x: u8 = x; // satisfy type inference
Ok(Tag::Common(CommonTag::try_from(x).map_err(|x: num_enum::TryFromPrimitiveError<CommonTag>| DecodeError::InvalidTagID(x.number))?))
},
Err(e) => match e {
NumValueReadError::TypeMismatch(_) => Ok(Tag::Other(decode_string(r)?)),
_ => Err(DecodeError::ParseError)
}
}
}
fn decode_attr<R: io::Read>(r: &mut R) -> Result<Attribute, DecodeError> {
match decode::read_int(r) {
Ok(x) => {
let x: u8 = x; // satisfy type inference
Ok(Attribute::Common(CommonAttr::try_from(x)
.map_err(|x: num_enum::TryFromPrimitiveError<CommonAttr>| DecodeError::InvalidAttrID(x.number))?))
},
Err(e) => match e {
NumValueReadError::TypeMismatch(_) => Ok(Attribute::Other(decode_string(r)?)),
_ => Err(DecodeError::ParseError)
}
}
}
fn decode_nodes<R: io::Read>(r: &mut R) -> Result<Vec<Node>, DecodeError> {
let len = decode::read_array_len(r).map_err(|_| DecodeError::ParseError)?;
let mut out = Vec::with_capacity(len as usize);
for _ in 0..len {
out.push(decode_node(r)?);
}
Ok(out)
}
fn decode_node<R: io::Read>(r: &mut R) -> Result<Node, DecodeError> {
match decode::read_array_len(r) {
Ok(len) => {
if len > 3 || len < 1 {
return Err(DecodeError::ParseError)
}
let tag = decode_tag(r)?;
if len > 1 {
if len > 2 {
let maplen = decode::read_map_len(r).map_err(|_| DecodeError::ParseError)?;
let mut attrs = HashMap::with_capacity(maplen as usize);
for _ in 0..maplen {
let key = decode_attr(r)?;
let val = decode_string(r)?;
attrs.insert(key, val);
}
let children = decode_nodes(r)?;
Ok(Node::Element { tag, attributes: attrs, children })
} else {
Ok(Node::SimpleElement { tag, children: decode_nodes(r)? })
}
} else {
Ok(Node::EmptyElement(tag))
}
},
Err(decode::ValueReadError::TypeMismatch(_)) => decode_string(r).map(Node::Text),
Err(_) => Err(DecodeError::ParseError)
}
}
fn main() {
let mut out = Vec::new();
let attrs1 = vec![
(Attribute::Common(CommonAttr::Class), "test1".to_string()),
(Attribute::Common(CommonAttr::Id), "idtest".to_string()),
(Attribute::Other("test-attr".to_string()), "test attr content".to_string())
].into_iter().collect();
let attrs2 = vec![
(Attribute::Common(CommonAttr::Href), "/test-href".to_string()),
(Attribute::Other("test-attr-2".to_string()), "test attr 2 content".to_string()),
(Attribute::Common(CommonAttr::Id), "id2".to_string()),
].into_iter().collect();
let node2 = Node::Element { tag: Tag::Common(CommonTag::Div), attributes: attrs2,
children: vec![Node::Text(String::from("hello, 2"))] };
let node1 = Node::Element { tag: Tag::Common(CommonTag::Div), attributes: attrs1,
children: vec![Node::Text(String::from("hello, world")), node2] };
encode_node(&mut out, &node1).unwrap();
println!("{:?} {:?}", out, node1);
let res = decode_node(&mut std::io::Cursor::new(out)).unwrap();
assert_eq!(res, node1);
}