commit 99fd700ba6d51007300a9d9aeee05cdda5a5d5f5 Author: 8051Enthusiast <8051Enthusiast@protonmail.com> Date: Thu Apr 9 11:18:47 2020 +0200 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..801ac3b --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +/target + +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..f64c90e --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,136 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + +[[package]] +name = "clap" +version = "2.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + 
+[[package]] +name = "hermit-abi" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "725cf19794cf90aa94e65050cb4191ff5d8fa87a498383774c47b332e3af952e" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea0c0405123bba743ee3f91f49b1c7cfb684eef0da0a50110f758ccf24cdff0" + +[[package]] +name = "regex-automata" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" + +[[package]] +name = "regex2fat" +version = "0.1.0" +dependencies = [ + "clap", + "regex-automata", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "unicode-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" + +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" + +[[package]] +name = "winapi" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..6a73ff9 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "regex2fat" +description = "Turn your favourite regex into FAT32" +version = "0.1.0" +authors = ["8051Enthusiast <8051Enthusiast@protonmail.com>"] +readme = "README.md" +repository = "https://github.com/8051Enthusiast/regex2fat" +keywords = ["regex", "fat"] +edition = "2018" +license = "Unlicense" + + +[dependencies] +regex-automata = "0.1" +clap = "2.33" diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3ca183 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +regex2fat +========= + +Did you ever want to match a regex, but all you had was a fat32 driver? +Ever wanted to serialize your regex DFAs into one of the most widely supported formats used by over 3 billion devices? +[Are directory loops your thing?](https://xkcd.com/981/) + +Worry no more, with `regex2fat`, this has become easier than ever before! +With just a little `regex2fat '[YOUR] F{4}VOUR{1,7}E (R[^E]G)*EX HERE.' /dev/whatever`, you will have a fat32 regex DFA of your favourite regex. +For example, to see whether the string 'Y FFFFVOURRE EX HEREM' would match, just mount it and check if '/Y/SPACE/F/F/F/F/V/O/U/R/R/E/SPACE/E/X/SPACE/H/E/R/E/M/MATCH' exists. 
+ +To run it, you can [install cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html) and then run `cargo install regex2fat` (or compile it directly from this repo). +If you have the cargo bin directory in your path, you should be able to invoke it as described above. +The file created will be a fat32 image, which can probably be mounted or put on a drive in some way, but most likely shouldn't. + +## FAQ +### Q: How does this work? +A: Regular regexes (i.e. no backreferences and similar advanced features) can be turned into a so-called DFA (deterministic finite automaton). +This is basically a bunch of arrows going between states, where an arrow is labeled with a letter so that a letter in a state causes the current state to go along the arrow to another state, with a subset of states being accepting. +Yes, I'm bad at explaining, you're better off reading [the wikipedia article on DFAs](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) if you don't know what it is. + +Because I'm lazy, I used [BurntSushi/regex-automata](https://github.com/BurntSushi/regex-automata) to get a DFA from a regex. + +While Fat32 normally has a tree-like structure, each directory just references blocks anywhere on the file system, so the same block can be referenced from multiple directories. +The directories also have no explicit field for parent directories, so one can leave `..` out. +This allows for graph structures inside a file system, which a DFA basically is. + +### Q: Should I use this in production anywhere? +A: No, but I can't stop you. + +### Q: Does this actually work? +A: I've tried it on Windows 10 and Linux so far. +It seems to work flawlessly on Windows as far as I've tested. + +On Linux, the fat32 code claims a directory is invalid if there are two dentries with the same directory name and the same parent in a loop (or something like that), so some paths are forbidden. + +Might be fun to try on some embedded devices. 
+ +### Q: NOOOOOOOOOOO!!! YOU CAN'T TURN A DFA INTO A FAT32 FILE SYSTEM!!!! YOU CAN'T JUST HAVE A DIRECTORY WITH MULTIPLE PARENTS!!! YOU ARE BREAKING THE ASSUMPTION OF LACK OF LOOPERINOS NOOOOOOOOO +A: Haha OS-driven regex engine go brrrrr diff --git a/src/fat32.rs b/src/fat32.rs new file mode 100644 index 0000000..3916c45 --- /dev/null +++ b/src/fat32.rs @@ -0,0 +1,185 @@ +use regex_automata::DFA; +use std::collections::HashMap; +pub type UFat = u32; +pub const BLOCK_SIZE: usize = 512; +const BOOT_SECTOR: [u8; 90] = [ + /* 0 */ 0xeb, 0xfe, 0x90, // jump to self (placeholder) + /* 3 */ 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x20, 0x20, // "regex " as vendor name + /* 11 */ 0x00, 0x02, // bytes per sector (512) + /* 13 */ 0x01, // one sector per cluster, why not + /* 14 */ 0x08, 0x00, // 8 reserved sectors + /* 16 */ 0x01, // one fat sector (don't really need two) + /* 17 */ 0x00, 0x00, // zero for fat32 + /* 19 */ 0x00, 0x00, // zero for fat32 + /* 21 */ 0xF8, // pretend to be a non-removable device + /* 22 */ 0x00, 0x00, // zero for fat32 + /* 24 */ 0x01, 0x00, // it is the year 2020, no one uses CHS + /* 26 */ 0x01, 0x00, // but the values are 1 to prevent divide by zero... 
+ /* 28 */ 0x00, 0x00, 0x00, 0x00, // I don't ever want to boot from this + /* 32 */ 0, 0, 0, 0, // total number of sectors, gets calculated later + /* 36 */ 0, 0, 0, 0, // number of sectors for FAT, gets calculated later + /* 40 */ 0x00, 0x00, // fat mirroring enabled + /* 42 */ 0x00, 0x00, // version 0 + /* 44 */ 0x02, 0x00, 0x00, 0x00, // first cluster of root directory is 2 + /* 48 */ 0x01, 0x00, // FSINFO location + /* 50 */ 0x06, 0x00, // backup in sector 6 + /* 52 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 12 zeros reserved + /* 64 */ 0x80, // sure hope no one ever uses this on a floppy + /* 65 */ 0x00, // reserved + /* 66 */ 0x00, // no volume label/serial + /* 67 */ 0x00, 0x00, 0x00, 0x00, // no serial + /* 71 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // no volume label + /* 82 */ 0x66, 0x61, 0x74, 0x33, 0x32, 0x20, 0x20, 0x20 // "FAT32 " +]; +const BOOT_SECTOR_TOTAL_SEC_32: usize = 32; +const BOOT_SECTOR_FAT_SZ_32: usize = 36; + +const FSINFO_HEAD: [u8; 4] = [0x52, 0x52, 0x61, 0x41]; + +const FSINFO_TAIL: [u8; 28] = [ + 0x72, 0x72, 0x41, 0x61, // required signature + 0x00, 0x00, 0x00, 0x00, // ideally, we used all sectors (else we would just make the image smaller) + 0xff, 0xff, 0xff, 0xff, // don't know where the first free sector is, if there is none + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // a bunch of zeros + 0x00, 0x00, 0x55, 0xaa // classic IBM +]; + +const FAT32_EOF: [u8; 4] = [0xff, 0xff, 0xff, 0x0f]; + +pub struct StatePosInfo { + pub block: UFat, // position (in blocks) of state dir + pub byte_sized: usize, // size (in bytes) of state dir +} +pub struct StateFatMap { + pub blocks: UFat, + pub order_list: Vec, + pub pos_hash: HashMap, +} + + +fn write_u32_into(into: &mut [u8], pos: usize, val: u32) { + // why use indexing when iterators do the job in triple the space + // (what I would love is assigning to slices) + for (x, &y) in into.iter_mut().skip(pos).take(4).zip(val.to_le_bytes().iter()) { + *x = y; + } +} + +fn write_u16_into(into: &mut [u8], pos: 
usize, val: u16) { + for (x, &y) in into.iter_mut().skip(pos).take(2).zip(val.to_le_bytes().iter()) { + *x = y; + } +} + +pub fn len_to_block(size: usize) -> UFat { + (size/BLOCK_SIZE + if size % BLOCK_SIZE != 0 {1} else {0}) as UFat +} + +pub fn generate_header(n_state_sector: UFat) -> Vec { + + // boot block + let mut boot_and_fsinfo = BOOT_SECTOR.to_vec(); + let fatsize: u32 = len_to_block((2+n_state_sector as usize)*(std::mem::size_of::())); + write_u32_into(&mut boot_and_fsinfo, BOOT_SECTOR_FAT_SZ_32, fatsize); + write_u32_into(&mut boot_and_fsinfo, BOOT_SECTOR_TOTAL_SEC_32, n_state_sector + 8 + fatsize); + boot_and_fsinfo.extend_from_slice(&[0u8; BLOCK_SIZE - 2 - BOOT_SECTOR.len()]); + boot_and_fsinfo.push(0x55); + boot_and_fsinfo.push(0xaa); + + // fsinfo + boot_and_fsinfo.extend_from_slice(&FSINFO_HEAD); + boot_and_fsinfo.extend_from_slice(&[0u8; BLOCK_SIZE - FSINFO_HEAD.len() - FSINFO_TAIL.len()]); + boot_and_fsinfo.extend_from_slice(&FSINFO_TAIL); + let mut volume = boot_and_fsinfo.clone(); + + volume.extend_from_slice(&[0u8; 4*BLOCK_SIZE]); + + // backup copy in block 6 and 7 + volume.append(&mut boot_and_fsinfo); + volume +} + +pub fn generate_fat(state_blocks: &StateFatMap, pad: UFat) -> Result, &'static str> { + let mut fat = Vec::new(); + fat.extend_from_slice(&FAT32_EOF); + fat.extend_from_slice(&FAT32_EOF); + let mut current_cluster: UFat = 2; + for state in &state_blocks.order_list { + let pl = match state_blocks.pos_hash.get(&state) { + Some(x) => x, + None => return Err("Refernce to invalid state") + }; + let size = len_to_block(pl.byte_sized); + if size == 0 { + return Err("Zero size state"); + } + for i in 0..size { + current_cluster += 1; + if i == size - 1 { + fat.extend_from_slice(&FAT32_EOF); + } + else { + fat.extend_from_slice(¤t_cluster.to_le_bytes()); + } + } + } + for _ in 0..pad { + fat.extend_from_slice(&[0xffu8, 0xff, 0xff, 0x0f]); + } + if fat.len() % BLOCK_SIZE != 0 { + fat.extend( + std::iter::repeat(0u8) + .take(BLOCK_SIZE - 
fat.len() % BLOCK_SIZE) + ); + } + Ok(fat) +} + +const ENTRY_TEMPLATE: [u8; 32] = [ + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // to be filled in (8.3 name) + /* 11 */ 0, // attributes (to be filled in) + /* 12 */ 0x00, // reserved + /* 13 */ 0x00, // creation time deciseconds (0) + /* 14 */ 0x00, 0x00, // creation time + /* 16 */ 0x00, 0x00, // creation date + /* 18 */ 0x00, 0x00, // access date + /* 20 */ 0, 0, // to be filled in (cluster high word) + /* 22 */ 0x00, 0x00, // write time + /* 24 */ 0x21, 0x00, // write date (1980-01-01) + /* 26 */ 0, 0, // to be filled in (cluster low word) + /* 28 */ 0, 0, 0, 0, // size for directory is zero +]; + +pub fn generate_dir_short(letter: u8, target: UFat) -> Vec { + let name_8_3: [u8; 11] = if letter == b' ' { + *b"SPACE " + } + else { + // fat32 entries are padded with space + [letter, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20] + }; + let mut dir_entry = ENTRY_TEMPLATE.to_vec(); + for (x, &y) in dir_entry.iter_mut().take(11).zip(name_8_3.iter()) { + *x = y; + } + dir_entry[11] = 0x11; // read-only (defunct but I'll use it anyway), directory + write_u16_into(&mut dir_entry, 20, (target >> 16) as u16); + write_u16_into(&mut dir_entry, 26, (target & 0xffff) as u16); + // directories have size of zero + write_u32_into(&mut dir_entry, 28, 0); + dir_entry +} + +pub fn generate_match(target: UFat) -> Vec { + let name_8_3 = *b"MATCH "; + let mut dir_entry = ENTRY_TEMPLATE.to_vec(); + for (x, &y) in dir_entry.iter_mut().take(11).zip(name_8_3.iter()) { + *x = y; + } + dir_entry[11] = 0; + write_u16_into(&mut dir_entry, 20, (target >> 16) as u16); + write_u16_into(&mut dir_entry, 26, (target & 0xffff) as u16); + // just make it a 0-length file, idc + write_u32_into(&mut dir_entry, 28, 0); + dir_entry +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..3a7b6fe --- /dev/null +++ b/src/main.rs @@ -0,0 +1,163 @@ +mod fat32; +use clap::{App, Arg}; +use fat32::{StateFatMap, StatePosInfo, 
UFat}; +use regex_automata::{dense, DFA}; +use std::collections::{HashMap, HashSet}; +use std::error::Error; +use std::fs::File; +use std::io::Write; +use std::process::exit; + +const FORBIDDEN_PRINT_ASCII: [u8; 17] = [ + 0x22, 0x2a, 0x2b, 0x2c, 0x2e, 0x2f, 0x3a, 0x3b, 0x3c, 0x3c, 0x3d, 0x3e, 0x3f, 0x5b, 0x5c, 0x5d, + 0x7c, +]; + +// precalculate the position of every dfa state inside the fat table so we can +// later replace the referenced state numbers by the fat entry when writing +// directories +fn determine_state_positions( + dfa: &D, + validlist: &[u8], +) -> Result, &'static str> { + let nomatch_len = validlist.len() * 32; + let match_len = (validlist.len() + 1) * 32; + // root directory starts at 2 + let mut current_block: UFat = 2; + let mut current_index: usize = 0; + // vector of visited states in order of visit + let mut state_vec = Vec::new(); + // map state numbers to StatePosInfos + let mut state_pos_hash = HashMap::new(); + // keep track of visited states + let mut state_set = HashSet::new(); + state_vec.push(dfa.start_state()); + while let Some(¤t_state) = state_vec.get(current_index) { + // queue all unvisited states from current state + for &next_byte in validlist { + let next_state = dfa.next_state(current_state, next_byte); + if state_set.insert(next_state) { + state_vec.push(next_state); + } + } + // relevant for size of directory (but mostly not because it's constant + // and they're both the same) + let size = if dfa.is_match_state(current_state) { + match_len + } else { + nomatch_len + }; + state_pos_hash.insert( + current_state, + StatePosInfo { + block: current_block, + byte_sized: size, + }, + ); + match current_block.checked_add(fat32::len_to_block(size)) { + Some(val) => { + current_block = val; + } + None => return Err("State machine exceeds Fate32 capacity!"), + } + current_index += 1; + } + Ok(StateFatMap { + blocks: current_block - 2, + order_list: state_vec, + pos_hash: state_pos_hash, + }) +} + +fn regex_to_fat32( + dfa: &D, + 
validlist: &[u8], + mut vol: W, +) -> Result<(), Box> { + let state_blocks = determine_state_positions(&dfa, &validlist)?; + // pad until at least 65536 blocks, since otherwise ideologically + // I would have to implement fat12/fat16 + // also keep at least one free block for match file (which is 0 bytes, + // but I'm not sure if it needs to reference a valid block) + let pad = 1isize.max(65536 - state_blocks.blocks as isize) as UFat; + vol.write_all(&fat32::generate_header(state_blocks.blocks + pad))?; + vol.write_all(&fat32::generate_fat(&state_blocks, pad)?)?; + for &state in &state_blocks.order_list { + let mut current_dir = Vec::::new(); + // generate directories for each possible character + for &c in validlist { + let next_state = dfa.next_state(state, c); + // maps the state to the block where the state directory is + let &state_block = &state_blocks.pos_hash[&next_state].block; + current_dir.append(&mut fat32::generate_dir_short(c, state_block)); + } + // if accepting state, put match file into dir + if dfa.is_match_state(state) { + current_dir.append(&mut fat32::generate_match(state_blocks.blocks + 2)) + } + if current_dir.len() % fat32::BLOCK_SIZE == 0 { + vol.write_all(¤t_dir)?; + continue; + } + // fill up current block to multiple of BLOCK_SIZE + current_dir.extend( + std::iter::repeat(0u8).take(fat32::BLOCK_SIZE - current_dir.len() % fat32::BLOCK_SIZE), + ); + vol.write_all(¤t_dir)?; + } + let emptyblock = &[0u8; fat32::BLOCK_SIZE]; + // make space for one more (match file) + for _ in 0..pad { + vol.write_all(emptyblock)?; + } + Ok(()) +} + +fn main() { + let matches = App::new("regex2fat") + .version("0.1.0") + .author("8051Enthusiast") + .about("Convert regex DFAs to FAT32 file systems") + .arg( + Arg::with_name("anchor") + .short("a") + .long("anchor") + .help("Anchor regex at beginning (off by default)"), + ) + .arg( + Arg::with_name("pattern") + .required(true) + .index(1) + .help("The regex pattern to match"), + ) + .arg( + 
Arg::with_name("outfile") + .required(true) + .index(2) + .help("The file to write the fat fs to"), + ) + .get_matches(); + let pattern = matches.value_of("pattern").unwrap(); + let dfa = dense::Builder::new() + // fat32 is case insensitive + .case_insensitive(true) + .anchored(matches.is_present("anchor")) + .build(pattern) + .unwrap_or_else(|err| { + eprintln!("Could not compile regex '{}': {}", pattern, err); + exit(1); + }); + let validlist: Vec = (0x20..0x61) + .chain(0x7b..0x7e) + .filter(|c| !FORBIDDEN_PRINT_ASCII.contains(c)) + .collect(); + let outfile = matches.value_of("outfile").unwrap(); + let file = File::create(outfile).unwrap_or_else(|err| { + eprintln!("Could not open file '{}': {}", outfile, err); + exit(1); + }); + regex_to_fat32(&dfa, &validlist, file).unwrap_or_else(|err| { + eprintln!("Could not write DFA to '{}': {}", outfile, err); + exit(1); + }); +}