From 393d8632f62c82084ae82a6a34fe143e5ed7dae0 Mon Sep 17 00:00:00 2001 From: osmarks Date: Mon, 31 Aug 2020 21:36:44 +0100 Subject: [PATCH] upload failed binary HTML thing --- binary-html/.gitignore | 1 + binary-html/Cargo.lock | 153 ++++++++++++++++++++++++++++++++ binary-html/Cargo.toml | 10 +++ binary-html/README.md | 11 +++ binary-html/serde.rs | 47 ++++++++++ binary-html/src/main.rs | 189 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 411 insertions(+) create mode 100644 binary-html/.gitignore create mode 100644 binary-html/Cargo.lock create mode 100644 binary-html/Cargo.toml create mode 100644 binary-html/README.md create mode 100644 binary-html/serde.rs create mode 100644 binary-html/src/main.rs diff --git a/binary-html/.gitignore b/binary-html/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/binary-html/.gitignore @@ -0,0 +1 @@ +/target diff --git a/binary-html/Cargo.lock b/binary-html/Cargo.lock new file mode 100644 index 0000000..136ac4e --- /dev/null +++ b/binary-html/Cargo.lock @@ -0,0 +1,153 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "binary-html" +version = "0.1.0" +dependencies = [ + "num_enum", + "rmp", + "thiserror", +] + +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + +[[package]] +name = "derivative" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb582b60359da160a9477ee80f15c8d784c477e69c217ef2cdd4169c24ea380f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-traits" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_enum" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "226b45a5c2ac4dd696ed30fa6b94b057ad909c7b7fc2e0d0808192bced894066" +dependencies = [ + "derivative", + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c0fd9eba1d5db0994a239e09c1be402d35622277e35468ba891aa5e3188ce7e" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proc-macro-crate" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785" +dependencies = [ + "toml", +] + +[[package]] +name = "proc-macro2" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04f5f085b5d71e2188cb8271e5da0161ad52c3f227a661a3c135fdf28e258b12" +dependencies = [ 
+ "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rmp" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f10b46df14cf1ee1ac7baa4d2fbc2c52c0622a4b82fa8740e37bc452ac0184f" +dependencies = [ + "byteorder", + "num-traits", +] + +[[package]] +name = "serde" +version = "1.0.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54c9a88f2da7238af84b5101443f0c0d0a3bbdc455e34a5c9497b1903ed55d5" + +[[package]] +name = "syn" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d8d6567fe7c7f8835a3a98af4208f3846fba258c1bc3c31d6e506239f11f9" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "thiserror" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dfdd070ccd8ccb78f4ad66bf1982dc37f620ef696c6b5028fe2ed83dd3d0d08" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd80fc12f73063ac132ac92aceea36734f04a1d93c1240c6944e23a3b8841793" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "toml" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffc92d160b1eef40665be3a05630d003936a3bc7da7421277846c2613e92c71a" +dependencies = [ + "serde", +] + +[[package]] +name = "unicode-xid" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" diff --git a/binary-html/Cargo.toml b/binary-html/Cargo.toml new file mode 100644 index 0000000..5af49e7 --- 
/dev/null +++ b/binary-html/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "binary-html" +version = "0.1.0" +authors = ["osmarks "] +edition = "2018" + +[dependencies] +num_enum = "0.5" +rmp = "0.8" +thiserror = "1.0" \ No newline at end of file diff --git a/binary-html/README.md b/binary-html/README.md new file mode 100644 index 0000000..7227a97 --- /dev/null +++ b/binary-html/README.md @@ -0,0 +1,11 @@ +# binary-html + +Contains a failed attempt at making a msgpack-based binary serialization format for HTML. +This would have a number of advantages, such as likely being much faster to parse, not having to deal with all the weird parsing irregularities textual HTML has to for backward compatibility reasons, and being more compact. +Unfortunately, this implementation doesn't actually work (quite possibly because I misunderstood how readers work), the code is kind of terrible anyway, and I cannot be bothered to fix it. + +## Format + +A node is either text or an element. An element has a tag name and optionally children and attributes. +Text is serialized directly to strings. +An element becomes `[tag, attributes, children]`, where tag is either a string or a number representing one of the more common tag types, attributes is a map of strings/numbers (same idea) to strings, and children is a list of nodes. Attributes can be omitted. Children can also be omitted if attributes are too. 
\ No newline at end of file
diff --git a/binary-html/serde.rs b/binary-html/serde.rs
new file mode 100644
index 0000000..f509151
--- /dev/null
+++ b/binary-html/serde.rs
@@ -0,0 +1,47 @@
+use std::collections::HashMap;
+use serde::{Serialize, Deserialize, Serializer, ser::SerializeTuple, de, de::Visitor, de::SeqAccess};
+use serde_repr::{Serialize_repr, Deserialize_repr};
+
+#[derive(Serialize_repr, Deserialize_repr, PartialEq, Eq, Debug, Hash)]
+#[repr(u8)]
+enum CommonTag {
+    Div,
+    Span,
+    P,
+    H1,
+}
+#[derive(Serialize_repr, Deserialize_repr, PartialEq, Eq, Debug, Hash)]
+#[repr(u8)]
+enum CommonAttr {
+    Class,
+    Id,
+    Href,
+}
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Hash)]
+enum Attribute { Common(CommonAttr), Other(String) }
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug, Hash)]
+enum Tag { Common(CommonTag), Other(String) }
+#[derive(Serialize, Deserialize, PartialEq, Eq, Debug)]
+enum Node {
+    Text(String),
+    Element { tag: Tag, attributes: HashMap<Attribute, String>, children: Vec<Node> },
+    ChildlessElement { tag: Tag, attributes: HashMap<Attribute, String> },
+    AttributelessElement { tag: Tag, children: Vec<Node> },
+    ContentlessElement(Tag)
+}
+
+use html5ever::driver::ParseOpts;
+use html5ever::tendril::TendrilSink;
+use html5ever::tree_builder::TreeBuilderOpts;
+use html5ever::{parse_document, serialize};
+
+fn main() {
+    let opts = ParseOpts {
+        tree_builder: TreeBuilderOpts {
+            drop_doctype: true,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    println!("Hello, world!");
+}
diff --git a/binary-html/src/main.rs b/binary-html/src/main.rs
new file mode 100644
index 0000000..4446e8e
--- /dev/null
+++ b/binary-html/src/main.rs
@@ -0,0 +1,292 @@
+//! Prototype msgpack-based binary serialization for an HTML-like node tree.
+//!
+//! Text nodes are written as msgpack strings. Elements are written as arrays:
+//! `[tag]`, `[tag, children]` or `[tag, attributes, children]`, where a tag or
+//! attribute name is either a string or the numeric ID of a common name.
+
+use std::collections::HashMap;
+use std::io::{self, Read};
+use rmp::{encode, decode, Marker};
+use std::convert::TryFrom;
+use num_enum::{IntoPrimitive, TryFromPrimitive};
+use thiserror::Error;
+
+/// Cap on how many elements are preallocated from a length prefix, so a tiny
+/// malformed input cannot request a huge allocation up front.
+const MAX_PREALLOC: usize = 1024;
+
+/// Tag names that are encoded as a numeric ID rather than a string.
+#[derive(IntoPrimitive, TryFromPrimitive)]
+#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
+#[repr(u8)]
+enum CommonTag {
+    Div = 0,
+    Span = 1,
+    P = 2,
+    H1 = 3,
+}
+/// Attribute names that are encoded as a numeric ID rather than a string.
+#[derive(IntoPrimitive, TryFromPrimitive)]
+#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
+#[repr(u8)]
+enum CommonAttr {
+    Class = 0,
+    Id = 1,
+    Href = 2,
+}
+/// An attribute name: a common ID or an arbitrary string.
+#[derive(PartialEq, Eq, Debug, Hash, Clone)]
+enum Attribute { Common(CommonAttr), Other(String) }
+/// A tag name: a common ID or an arbitrary string.
+#[derive(PartialEq, Eq, Debug, Hash, Clone)]
+enum Tag { Common(CommonTag), Other(String) }
+/// A node of the document tree.
+#[derive(PartialEq, Eq, Debug, Clone)]
+enum Node {
+    /// Text content, encoded as a string.
+    Text(String),
+    /// Encoded as `[tag, attributes, children]`.
+    Element { tag: Tag, attributes: HashMap<Attribute, String>, children: Vec<Node> },
+    /// Encoded as `[tag, children]`.
+    SimpleElement { tag: Tag, children: Vec<Node> },
+    /// Encoded as `[tag]`.
+    EmptyElement(Tag)
+}
+
+/// Converts a collection length to the `u32` used by msgpack length prefixes,
+/// reporting an error instead of silently truncating.
+fn len_u32(len: usize) -> Result<u32, encode::ValueWriteError> {
+    u32::try_from(len).map_err(|_| {
+        let err = io::Error::new(io::ErrorKind::InvalidInput, "collection too large");
+        encode::ValueWriteError::InvalidDataWrite(err)
+    })
+}
+
+/// Writes a tag as either its numeric ID or its name.
+fn encode_tag<W: io::Write>(wr: &mut W, tag: &Tag) -> Result<(), encode::ValueWriteError> {
+    match tag {
+        Tag::Common(t) => encode::write_u8(wr, (*t).into()),
+        Tag::Other(t) => encode::write_str(wr, t)
+    }
+}
+/// Writes an attribute name as either its numeric ID or its name.
+fn encode_attr<W: io::Write>(wr: &mut W, attr: &Attribute) -> Result<(), encode::ValueWriteError> {
+    match attr {
+        Attribute::Common(a) => encode::write_u8(wr, (*a).into()),
+        Attribute::Other(a) => encode::write_str(wr, a)
+    }
+}
+
+/// Writes a length-prefixed array of child nodes.
+fn encode_children<W: io::Write>(wr: &mut W, children: &[Node]) -> Result<(), encode::ValueWriteError> {
+    encode::write_array_len(wr, len_u32(children.len())?)?;
+    for child in children {
+        encode_node(wr, child)?;
+    }
+    Ok(())
+}
+
+/// Serializes `node` and all of its descendants to `wr`.
+fn encode_node<W: io::Write>(wr: &mut W, node: &Node) -> Result<(), encode::ValueWriteError> {
+    match node {
+        Node::Text(s) => encode::write_str(wr, s),
+        Node::Element { tag, attributes, children } => {
+            encode::write_array_len(wr, 3)?;
+            encode_tag(wr, tag)?;
+            encode::write_map_len(wr, len_u32(attributes.len())?)?;
+            for (k, v) in attributes {
+                encode_attr(wr, k)?;
+                encode::write_str(wr, v)?;
+            }
+            encode_children(wr, children)
+        },
+        Node::SimpleElement { tag, children } => {
+            encode::write_array_len(wr, 2)?;
+            encode_tag(wr, tag)?;
+            encode_children(wr, children)
+        },
+        Node::EmptyElement(tag) => {
+            encode::write_array_len(wr, 1)?;
+            encode_tag(wr, tag)
+        }
+    }
+}
+
+/// Errors produced while decoding.
+#[derive(Error, Debug)]
+enum DecodeError {
+    #[error("tag ID {0} not known")]
+    InvalidTagID(u8),
+    #[error("attribute ID {0} not known")]
+    InvalidAttrID(u8),
+    // TODO
+    #[error("parse fail")]
+    ParseError
+}
+
+/// A tag or attribute key as it appears on the wire, before mapping to an enum.
+enum Key { Id(u8), Name(String) }
+
+/// Reads the next msgpack marker byte.
+fn read_marker<R: Read>(r: &mut R) -> Result<Marker, DecodeError> {
+    decode::read_marker(r).map_err(|_| DecodeError::ParseError)
+}
+
+/// Reads a big-endian length field of `width` (1, 2 or 4) bytes.
+fn read_len<R: Read>(r: &mut R, width: usize) -> Result<u32, DecodeError> {
+    let mut bytes = [0u8; 4];
+    r.read_exact(&mut bytes[4 - width..]).map_err(|_| DecodeError::ParseError)?;
+    Ok(u32::from_be_bytes(bytes))
+}
+
+/// Returns the string length announced by `marker`, or `None` if it is not a string marker.
+fn str_len<R: Read>(r: &mut R, marker: Marker) -> Result<Option<u32>, DecodeError> {
+    Ok(Some(match marker {
+        Marker::FixStr(len) => u32::from(len),
+        Marker::Str8 => read_len(r, 1)?,
+        Marker::Str16 => read_len(r, 2)?,
+        Marker::Str32 => read_len(r, 4)?,
+        _ => return Ok(None)
+    }))
+}
+
+/// Returns the array length announced by `marker`, or `None` if it is not an array marker.
+fn array_len<R: Read>(r: &mut R, marker: Marker) -> Result<Option<u32>, DecodeError> {
+    Ok(Some(match marker {
+        Marker::FixArray(len) => u32::from(len),
+        Marker::Array16 => read_len(r, 2)?,
+        Marker::Array32 => read_len(r, 4)?,
+        _ => return Ok(None)
+    }))
+}
+
+/// Reads exactly `len` bytes of string payload and validates them as UTF-8.
+fn read_str_data<R: Read>(r: &mut R, len: u32) -> Result<String, DecodeError> {
+    // `Vec::with_capacity` yields an empty buffer, into which `read` stores nothing.
+    // Reading through `take` fills the vector and grows it only as data arrives.
+    let mut buf = Vec::new();
+    r.by_ref().take(u64::from(len)).read_to_end(&mut buf).map_err(|_| DecodeError::ParseError)?;
+    if buf.len() != len as usize {
+        return Err(DecodeError::ParseError)
+    }
+    String::from_utf8(buf).map_err(|_| DecodeError::ParseError)
+}
+
+/// Decodes a msgpack string.
+fn decode_string<R: Read>(r: &mut R) -> Result<String, DecodeError> {
+    let marker = read_marker(r)?;
+    let len = str_len(r, marker)?.ok_or(DecodeError::ParseError)?;
+    read_str_data(r, len)
+}
+
+/// Decodes a key: a numeric ID (positive fixint or `u8`) or a name string.
+///
+/// The marker is read once and dispatched on; attempting an integer read and
+/// then a separate string read would consume the marker byte twice.
+fn decode_key<R: Read>(r: &mut R) -> Result<Key, DecodeError> {
+    match read_marker(r)? {
+        Marker::FixPos(id) => Ok(Key::Id(id)),
+        // Only a single byte is read, so the cast cannot truncate.
+        Marker::U8 => Ok(Key::Id(read_len(r, 1)? as u8)),
+        marker => {
+            let len = str_len(r, marker)?.ok_or(DecodeError::ParseError)?;
+            Ok(Key::Name(read_str_data(r, len)?))
+        }
+    }
+}
+
+/// Decodes a tag, rejecting numeric IDs that do not name a `CommonTag`.
+fn decode_tag<R: Read>(r: &mut R) -> Result<Tag, DecodeError> {
+    match decode_key(r)? {
+        Key::Id(id) => CommonTag::try_from(id)
+            .map(Tag::Common)
+            .map_err(|_| DecodeError::InvalidTagID(id)),
+        Key::Name(name) => Ok(Tag::Other(name))
+    }
+}
+
+/// Decodes an attribute name, rejecting numeric IDs that do not name a `CommonAttr`.
+fn decode_attr<R: Read>(r: &mut R) -> Result<Attribute, DecodeError> {
+    match decode_key(r)? {
+        Key::Id(id) => CommonAttr::try_from(id)
+            .map(Attribute::Common)
+            .map_err(|_| DecodeError::InvalidAttrID(id)),
+        Key::Name(name) => Ok(Attribute::Other(name))
+    }
+}
+
+/// Decodes a length-prefixed array of nodes.
+fn decode_nodes<R: Read>(r: &mut R) -> Result<Vec<Node>, DecodeError> {
+    let len = decode::read_array_len(r).map_err(|_| DecodeError::ParseError)?;
+    let mut out = Vec::with_capacity((len as usize).min(MAX_PREALLOC));
+    for _ in 0..len {
+        out.push(decode_node(r)?);
+    }
+    Ok(out)
+}
+
+/// Decodes a map of attribute names to string values.
+fn decode_attrs<R: Read>(r: &mut R) -> Result<HashMap<Attribute, String>, DecodeError> {
+    let len = decode::read_map_len(r).map_err(|_| DecodeError::ParseError)?;
+    let mut attrs = HashMap::with_capacity((len as usize).min(MAX_PREALLOC));
+    for _ in 0..len {
+        let key = decode_attr(r)?;
+        let val = decode_string(r)?;
+        attrs.insert(key, val);
+    }
+    Ok(attrs)
+}
+
+/// Decodes one node: a string marker yields text, an array of 1 to 3 items an element.
+fn decode_node<R: Read>(r: &mut R) -> Result<Node, DecodeError> {
+    let marker = read_marker(r)?;
+    if let Some(len) = str_len(r, marker)? {
+        return Ok(Node::Text(read_str_data(r, len)?))
+    }
+    let len = array_len(r, marker)?.ok_or(DecodeError::ParseError)?;
+    // Reject unsupported element shapes before consuming any of their contents.
+    if !(1..=3).contains(&len) {
+        return Err(DecodeError::ParseError)
+    }
+    let tag = decode_tag(r)?;
+    Ok(match len {
+        1 => Node::EmptyElement(tag),
+        2 => Node::SimpleElement { tag, children: decode_nodes(r)? },
+        _ => {
+            let attributes = decode_attrs(r)?;
+            Node::Element { tag, attributes, children: decode_nodes(r)? }
+        }
+    })
+}
+
+/// Encodes `node`, decodes the bytes again and asserts the result equals the input.
+fn roundtrip(node: &Node) {
+    let mut out = Vec::new();
+    encode_node(&mut out, node).unwrap();
+    println!("{:?} {:?}", out, node);
+    let res = decode_node(&mut io::Cursor::new(out)).unwrap();
+    assert_eq!(&res, node);
+}
+
+fn main() {
+    let attrs1 = vec![
+        (Attribute::Common(CommonAttr::Class), "test1".to_string()),
+        (Attribute::Common(CommonAttr::Id), "idtest".to_string()),
+        (Attribute::Other("test-attr".to_string()), "test attr content".to_string())
+    ].into_iter().collect();
+    let attrs2 = vec![
+        (Attribute::Common(CommonAttr::Href), "/test-href".to_string()),
+        (Attribute::Other("test-attr-2".to_string()), "test attr 2 content".to_string()),
+        (Attribute::Common(CommonAttr::Id), "id2".to_string()),
+    ].into_iter().collect();
+    let node2 = Node::Element { tag: Tag::Common(CommonTag::Div), attributes: attrs2,
+        children: vec![Node::Text(String::from("hello, 2"))] };
+    let node1 = Node::Element { tag: Tag::Common(CommonTag::Div), attributes: attrs1,
+        children: vec![Node::Text(String::from("hello, world")), node2] };
+    roundtrip(&node1);
+    // Also cover the shorter element shapes and a string-named tag.
+    roundtrip(&Node::SimpleElement {
+        tag: Tag::Other("custom-tag".to_string()),
+        children: vec![Node::EmptyElement(Tag::Common(CommonTag::Span))],
+    });
+}