1
0
mirror of https://github.com/SquidDev-CC/CC-Tweaked synced 2024-06-25 22:53:22 +00:00

Custom parse errors for Lua (#1298)

- Add several (internal) modules for lexing and parsing Lua code. These
   allow us to provide (hopefully) higher quality error messages than
   Lua's built-in messages.

 - `shell.run`, `edit` and `lua` now use this parser when fed invalid
   code. This allows us to provide better syntax errors, while not
   having any impact on the happy path.

   Note this does not affect any other mechanism for loading code 
   (`load`, `require`, `dofile`).

There's still a lot of work to do here in improving error message
quality, but hopefully this provides a good starting point.
This commit is contained in:
Jonathan Coates 2023-01-25 20:35:43 +00:00 committed by GitHub
parent e076818b29
commit a12b405acf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 4474 additions and 21 deletions

View File

@ -0,0 +1,173 @@
--[[- A pretty-printer for Lua errors.
:::warning
This is an internal module and SHOULD NOT be used in your own code. It may
be removed or changed at any time.
:::
This consumes a list of messages and "annotations" and displays the error to the
terminal.
@see cc.internal.syntax.errors For errors produced by the parser.
@local
]]
local pretty = require "cc.pretty"
local expect = require "cc.expect"
local expect, field = expect.expect, expect.field
local wrap = require "cc.strings".wrap
--- Write a message to the screen.
-- @tparam cc.pretty.Doc|string msg The message to write.
--- Write a message to the screen.
-- @tparam cc.pretty.Doc|string msg The message to write. Tables are assumed
-- to be pretty-printer documents; anything else is printed directly.
local function display(msg)
    if type(msg) ~= "table" then
        print(msg)
    else
        pretty.print(msg)
    end
end
--- Write a message to the screen, aligning wrapped lines to the current
-- cursor position.
-- @tparam cc.pretty.Doc|string msg The message to write.
-- @tparam function preamble Called with the row number before each wrapped
-- continuation line is written, so the caller can draw a margin accent first.
local function display_here(msg, preamble)
    expect(1, msg, "string", "table")
    local x = term.getCursorPos()
    local width, height = term.getSize()
    width = width - x + 1 -- Columns remaining between the cursor and the right edge.
    -- Move to the next row (scrolling when at the bottom), run the preamble,
    -- then return the cursor to our alignment column.
    local function newline()
        local _, y = term.getCursorPos()
        if y >= height then
            term.scroll(1)
        else
            y = y + 1
        end
        preamble(y)
        term.setCursorPos(x, y)
    end
    if type(msg) == "string" then
        local lines = wrap(msg, width)
        term.write(lines[1])
        for i = 2, #lines do
            newline()
            term.write(lines[i])
        end
    else
        local def_colour = term.getTextColour()
        -- Recursively render a cc.pretty document, wrapping text nodes to the
        -- available width and restoring the default colour after each one.
        local function display_impl(doc)
            expect(1, doc, "table")
            local kind = doc.tag
            if kind == "nil" then return
            elseif kind == "text" then
                -- TODO: cc.strings.wrap doesn't support a leading indent. We should
                -- fix that!
                -- Might also be nice to add a wrap_iter, which returns an iterator over
                -- start_pos, end_pos instead.
                if doc.colour then term.setTextColour(doc.colour) end
                local x1 = term.getCursorPos()
                -- Pad with the text already on this line so wrapping accounts
                -- for it, then strip that padding from the first segment.
                local lines = wrap((" "):rep(x1 - x) .. doc.text, width)
                term.write(lines[1]:sub(x1 - x + 1))
                for i = 2, #lines do
                    newline()
                    term.write(lines[i])
                end
                if doc.colour then term.setTextColour(def_colour) end
            elseif kind == "concat" then
                for i = 1, doc.n do display_impl(doc[i]) end
            else
                error("Unknown doc " .. kind)
            end
        end
        display_impl(msg)
    end
    print()
end
--- A list of colours we can use for error messages. Annotations cycle
-- through these in order.
local error_colours = { colours.red, colours.green, colours.magenta, colours.orange }
--- The accent line used to denote a block of code.
local code_accent = pretty.text("\x95", colours.cyan)
--[[- Print a parser error message (and its annotations) to the terminal.
@tparam { get_pos = function, get_line = function } context
The context where the error was reported. This effectively acts as a view
over the underlying source, exposing the following functions:
- `get_pos`: Get the line and column of an opaque position.
- `get_line`: Get the source code for an opaque position.
@tparam table message The message to display, as produced by @{cc.internal.syntax.errors}.
]]
return function(context, message)
    expect(1, context, "table")
    expect(2, message, "table")
    field(context, "get_pos", "function")
    field(context, "get_line", "function")
    if #message == 0 then error("Message is empty", 2) end
    local error_colour = 1
    local width = term.getSize()
    for msg_idx = 1, #message do
        if msg_idx > 1 then print() end
        local msg = message[msg_idx]
        if type(msg) == "table" and msg.tag == "annotate" then
            -- Annotations render as a source snippet with a highlighted
            -- region and an optional message underneath.
            local line, col = context.get_pos(msg.start_pos)
            local end_line, end_col = context.get_pos(msg.end_pos)
            local contents = context.get_line(msg.start_pos)
            -- Pick a starting column. We pick the left-most position which fits
            -- in one of the following:
            -- - 10 characters after the start column.
            -- - 5 characters after the end column.
            -- - The end of the line.
            if line ~= end_line then end_col = #contents end
            local start_col = math.max(1, math.min(col + 10, end_col + 5, #contents + 1) - width + 1)
            -- Pick a colour for this annotation, cycling through the palette.
            local colour = colours.toBlit(error_colours[error_colour])
            error_colour = (error_colour % #error_colours) + 1
            -- Print the line number and snippet of code. We display french
            -- quotes on either side of the string if it is truncated.
            local str_start, str_end = start_col, start_col + width - 2
            local prefix, suffix = "", ""
            if start_col > 1 then
                str_start = str_start + 1
                prefix = pretty.text("\xab", colours.grey)
            end
            if str_end < #contents then
                str_end = str_end - 1
                suffix = pretty.text("\xbb", colours.grey)
            end
            pretty.print(code_accent .. pretty.text("Line " .. line, colours.cyan))
            pretty.print(code_accent .. prefix .. pretty.text(contents:sub(str_start, str_end), colours.lightGrey) .. suffix)
            -- Print a line highlighting the region of text.
            local _, y = term.getCursorPos()
            pretty.write(code_accent)
            local indicator_end = end_col
            if end_col > str_end then indicator_end = str_end end
            local indicator_len = indicator_end - col + 1
            term.setCursorPos(col - start_col + 2, y)
            term.blit(("\x83"):rep(indicator_len), colour:rep(indicator_len), ("f"):rep(indicator_len))
            print()
            -- And then print the annotation's message, if present.
            if msg.msg ~= "" then
                term.blit("\x95", colour, "f")
                display_here(msg.msg, function(y)
                    term.setCursorPos(1, y)
                    term.blit("\x95", colour, "f")
                end)
            end
        else
            -- Plain strings/documents are printed as-is.
            display(msg)
        end
    end
end

View File

@ -0,0 +1,552 @@
--[[- The error messages reported by our lexer and parser.
:::warning
This is an internal module and SHOULD NOT be used in your own code. It may
be removed or changed at any time.
:::
This provides a list of factory methods which take source positions and produce
appropriate error messages targeting that location. These error messages can
then be displayed to the user via @{cc.internal.error_printer}.
@local
]]
local pretty = require "cc.pretty"
local expect = require "cc.expect".expect
local tokens = require "cc.internal.syntax.parser".tokens
--- Build an "annotate" message component pointing at a span of source.
--
-- May be called as `annotate(start_pos, end_pos, msg)` or, for a single
-- position, as `annotate(pos[, msg])`.
--
-- @tparam number start_pos The start position of the annotated span.
-- @tparam[opt] number end_pos The end position (defaults to start_pos).
-- @tparam[opt] string|table msg The message to attach (defaults to "").
-- @treturn table The annotation component.
local function annotate(start_pos, end_pos, msg)
    -- Detect the two-argument form annotate(pos, msg) and shift arguments.
    if msg == nil then
        local kind = type(end_pos)
        if kind == "string" or kind == "table" or kind == "nil" then
            start_pos, end_pos, msg = start_pos, start_pos, end_pos
        end
    end
    expect(1, start_pos, "number")
    expect(2, end_pos, "number")
    expect(3, msg, "string", "table", "nil")
    return { tag = "annotate", start_pos = start_pos, end_pos = end_pos, msg = msg or "" }
end
--- Format a string as a non-highlighted block of code.
--
-- @tparam string msg The code to format.
-- @treturn cc.pretty.Doc The formatted code.
local function code(msg)
    return pretty.text(msg, colours.lightGrey)
end
--- Maps tokens to a more friendly version.
-- Looking up a token id not present here raises an error (via the `__index`
-- metamethod below) rather than silently producing `nil`.
local token_names = setmetatable({
    -- Specific tokens.
    [tokens.IDENT] = "identifier",
    [tokens.NUMBER] = "number",
    [tokens.STRING] = "string",
    [tokens.EOF] = "end of file",
    -- Symbols and keywords
    [tokens.ADD] = code("+"),
    [tokens.AND] = code("and"),
    [tokens.BREAK] = code("break"),
    [tokens.CBRACE] = code("}"),
    [tokens.COLON] = code(":"),
    [tokens.COMMA] = code(","),
    [tokens.CONCAT] = code(".."),
    [tokens.CPAREN] = code(")"),
    [tokens.CSQUARE] = code("]"),
    [tokens.DIV] = code("/"),
    [tokens.DO] = code("do"),
    [tokens.DOT] = code("."),
    [tokens.DOTS] = code("..."),
    [tokens.ELSE] = code("else"),
    [tokens.ELSEIF] = code("elseif"),
    [tokens.END] = code("end"),
    [tokens.EQ] = code("=="),
    [tokens.EQUALS] = code("="),
    [tokens.FALSE] = code("false"),
    [tokens.FOR] = code("for"),
    [tokens.FUNCTION] = code("function"),
    [tokens.GE] = code(">="),
    [tokens.GT] = code(">"),
    [tokens.IF] = code("if"),
    [tokens.IN] = code("in"),
    [tokens.LE] = code("<="),
    [tokens.LEN] = code("#"),
    [tokens.LOCAL] = code("local"),
    [tokens.LT] = code("<"),
    [tokens.MOD] = code("%"),
    [tokens.MUL] = code("*"),
    [tokens.NE] = code("~="),
    [tokens.NIL] = code("nil"),
    [tokens.NOT] = code("not"),
    [tokens.OBRACE] = code("{"),
    [tokens.OPAREN] = code("("),
    [tokens.OR] = code("or"),
    [tokens.OSQUARE] = code("["),
    [tokens.POW] = code("^"),
    [tokens.REPEAT] = code("repeat"),
    [tokens.RETURN] = code("return"),
    [tokens.SEMICOLON] = code(";"),
    [tokens.SUB] = code("-"),
    [tokens.THEN] = code("then"),
    [tokens.TRUE] = code("true"),
    [tokens.UNTIL] = code("until"),
    [tokens.WHILE] = code("while"),
}, { __index = function(_, name) error("No such token " .. tostring(name), 2) end })
local errors = {}
--------------------------------------------------------------------------------
-- Lexer errors
--------------------------------------------------------------------------------
--[[- A string which ends without a closing quote.
@tparam number start_pos The start position of the string.
@tparam number end_pos The end position of the string.
@tparam string quote The kind of quote (`"` or `'`).
@return The resulting parse error.
]]
function errors.unfinished_string(start_pos, end_pos, quote)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
expect(3, quote, "string")
return {
"This string is not finished. Are you missing a closing quote (" .. code(quote) .. ")?",
annotate(start_pos, "String started here."),
annotate(end_pos, "Expected a closing quote here."),
}
end
--[[- A string which ends with an escape sequence (so a literal `"foo\`). This
is slightly different from @{unfinished_string}, as we don't want to suggest
adding a quote.
@tparam number start_pos The start position of the string.
@tparam number end_pos The end position of the string.
@tparam string quote The kind of quote (`"` or `'`).
@return The resulting parse error.
]]
function errors.unfinished_string_escape(start_pos, end_pos, quote)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
expect(3, quote, "string")
return {
"This string is not finished.",
annotate(start_pos, "String started here."),
annotate(end_pos, "An escape sequence was started here, but with nothing following it."),
}
end
--[[- A long string was never finished.
@tparam number start_pos The start position of the long string delimiter.
@tparam number end_pos The end position of the long string delimiter.
@tparam number len The length of the long string delimiter, excluding the first `[`.
@return The resulting parse error.
]]
function errors.unfinished_long_string(start_pos, end_pos, len)
    expect(1, start_pos, "number")
    expect(2, end_pos, "number")
    expect(3, len, "number")
    return {
        "This string was never finished.",
        annotate(start_pos, end_pos, "String was started here."),
        "We expected a closing delimiter (" .. code("]" .. ("="):rep(len - 1) .. "]") .. ") somewhere after this string was started.",
    }
end
--[[- Malformed opening to a long string (i.e. `[=`).
@tparam number start_pos The start position of the long string delimiter.
@tparam number end_pos The end position of the long string delimiter.
@tparam number len The length of the long string delimiter, excluding the first `[`.
@return The resulting parse error.
]]
function errors.malformed_long_string(start_pos, end_pos, len)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
expect(3, len, "number")
return {
"Incorrect start of a long string.",
annotate(start_pos, end_pos),
"Tip: If you wanted to start a long string here, add an extra " .. code("[") .. " here.",
}
end
--[[- Malformed nesting of a long string.
@tparam number start_pos The start position of the long string delimiter.
@tparam number end_pos The end position of the long string delimiter.
@return The resulting parse error.
]]
function errors.nested_long_str(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
code("[[") .. " cannot be nested inside another " .. code("[[ ... ]]"),
annotate(start_pos, end_pos),
}
end
--[[- A malformed numeric literal.
@tparam number start_pos The start position of the number.
@tparam number end_pos The end position of the number.
@return The resulting parse error.
]]
function errors.malformed_number(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
"This isn't a valid number.",
annotate(start_pos, end_pos),
"Numbers must be in one of the following formats: " .. code("123") .. ", "
.. code("3.14") .. ", " .. code("23e35") .. ", " .. code("0x01AF") .. ".",
}
end
--[[- A long comment was never finished.
@tparam number start_pos The start position of the long string delimiter.
@tparam number end_pos The end position of the long string delimiter.
@tparam number len The length of the long string delimiter, excluding the first `[`.
@return The resulting parse error.
]]
function errors.unfinished_long_comment(start_pos, end_pos, len)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
expect(3, len, "number")
return {
"This comment was never finished.",
annotate(start_pos, end_pos, "Comment was started here."),
"We expected a closing delimiter (" .. code("]" .. ("="):rep(len - 1) .. "]") .. ") somewhere after this comment was started.",
}
end
--[[- `&&` was used instead of `and`.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.wrong_and(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
"Unexpected character.",
annotate(start_pos, end_pos),
"Tip: Replace this with " .. code("and") .. " to check if both values are true.",
}
end
--[[- `||` was used instead of `or`.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.wrong_or(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
"Unexpected character.",
annotate(start_pos, end_pos),
"Tip: Replace this with " .. code("or") .. " to check if either value is true.",
}
end
--[[- `!=` was used instead of `~=`.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.wrong_ne(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
"Unexpected character.",
annotate(start_pos, end_pos),
"Tip: Replace this with " .. code("~=") .. " to check if two values are not equal.",
}
end
--[[- An unexpected character was used.
@tparam number pos The position of this character.
@return The resulting parse error.
]]
function errors.unexpected_character(pos)
expect(1, pos, "number")
return {
"Unexpected character.",
annotate(pos, "This character isn't usable in Lua code."),
}
end
--------------------------------------------------------------------------------
-- Expression parsing errors
--------------------------------------------------------------------------------
--[[- A fallback error when we expected an expression but received another token.
@tparam number token The token id.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.expected_expression(token, start_pos, end_pos)
expect(1, token, "number")
expect(2, start_pos, "number")
expect(3, end_pos, "number")
return {
"Unexpected " .. token_names[token] .. ". Expected an expression.",
annotate(start_pos, end_pos),
}
end
--[[- A fallback error when we expected a variable but received another token.
@tparam number token The token id.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.expected_var(token, start_pos, end_pos)
expect(1, token, "number")
expect(2, start_pos, "number")
expect(3, end_pos, "number")
return {
"Unexpected " .. token_names[token] .. ". Expected a variable name.",
annotate(start_pos, end_pos),
}
end
--[[- `=` was used in an expression context.
@tparam number start_pos The start position of the `=` token.
@tparam number end_pos The end position of the `=` token.
@return The resulting parse error.
]]
function errors.use_double_equals(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
"Unexpected " .. code("=") .. " in expression.",
annotate(start_pos, end_pos),
"Tip: Replace this with " .. code("==") .. " to check if two values are equal.",
}
end
--[[- `=` was used after an expression inside a table.
@tparam number start_pos The start position of the `=` token.
@tparam number end_pos The end position of the `=` token.
@return The resulting parse error.
]]
function errors.table_key_equals(start_pos, end_pos)
expect(1, start_pos, "number")
expect(2, end_pos, "number")
return {
"Unexpected " .. code("=") .. " in expression.",
annotate(start_pos, end_pos),
"Tip: Wrap the preceding expression in " .. code("[") .. " and " .. code("]") .. " to use it as a table key.",
}
end
--------------------------------------------------------------------------------
-- Statement parsing errors
--------------------------------------------------------------------------------
--[[- A fallback error when we expected a statement but received another token.
@tparam number token The token id.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.expected_statement(token, start_pos, end_pos)
expect(1, token, "number")
expect(2, start_pos, "number")
expect(3, end_pos, "number")
return {
"Unexpected " .. token_names[token] .. ". Expected a statement.",
annotate(start_pos, end_pos),
}
end
--[[- `local function` was used with a table identifier.
@tparam number local_start The start position of the `local` token.
@tparam number local_end The end position of the `local` token.
@tparam number dot_start The start position of the `.` token.
@tparam number dot_end The end position of the `.` token.
@return The resulting parse error.
]]
function errors.local_function_dot(local_start, local_end, dot_start, dot_end)
expect(1, local_start, "number")
expect(2, local_end, "number")
expect(3, dot_start, "number")
expect(4, dot_end, "number")
return {
"Cannot use " .. code("local function") .. " with a table key.",
annotate(dot_start, dot_end, code(".") .. " appears here."),
annotate(local_start, local_end, "Tip: " .. "Try removing this " .. code("local") .. " keyword."),
}
end
--[[- A statement of the form `x.y z`
@tparam number pos The position right after this name.
@return The resulting parse error.
]]
function errors.standalone_name(pos)
expect(1, pos, "number")
return {
"Unexpected symbol after name.",
annotate(pos),
"Did you mean to assign this or call it as a function?",
}
end
--[[- A statement of the form `x.y`. This is similar to @{standalone_name}, but
when the next token is on another line.
@tparam number pos The position right after this name.
@return The resulting parse error.
]]
function errors.standalone_name_call(pos)
expect(1, pos, "number")
return {
"Unexpected symbol after variable.",
annotate(pos + 1, "Expected something before the end of the line."),
"Tip: Use " .. code("()") .. " to call with no arguments.",
}
end
--[[- `then` was expected
@tparam number if_start The start position of the `if`/`elseif` keyword.
@tparam number if_end The end position of the `if`/`elseif` keyword.
@tparam number token_pos The current token position.
@return The resulting parse error.
]]
function errors.expected_then(if_start, if_end, token_pos)
expect(1, if_start, "number")
expect(2, if_end, "number")
expect(3, token_pos, "number")
return {
"Expected " .. code("then") .. " after if condition.",
annotate(if_start, if_end, "If statement started here."),
annotate(token_pos, "Expected " .. code("then") .. " before here."),
}
end
--[[- `end` was expected
@tparam number block_start The start position of the block.
@tparam number block_end The end position of the block.
@tparam number token The id of the current token.
@tparam number token_start The start position of the current token.
@tparam number token_end The end position of the current token.
@return The resulting parse error.
]]
function errors.expected_end(block_start, block_end, token, token_start, token_end)
    -- Validate arguments, for consistency with the other error factories.
    expect(1, block_start, "number")
    expect(2, block_end, "number")
    expect(3, token, "number")
    expect(4, token_start, "number")
    expect(5, token_end, "number")
    return {
        "Unexpected " .. token_names[token] .. ". Expected " .. code("end") .. " or another statement.",
        annotate(block_start, block_end, "Block started here."),
        annotate(token_start, token_end, "Expected end of block here."),
    }
end
--[[- An unexpected `end` in a statement.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.unexpected_end(start_pos, end_pos)
    -- Validate arguments, for consistency with the other error factories.
    expect(1, start_pos, "number")
    expect(2, end_pos, "number")
    return {
        "Unexpected " .. code("end") .. ".",
        annotate(start_pos, end_pos),
        "Your program contains more " .. code("end") .. "s than needed. Check " ..
        "each block (" .. code("if") .. ", " .. code("for") .. ", " ..
        code("function") .. ", ...) only has one " .. code("end") .. ".",
    }
end
--------------------------------------------------------------------------------
-- Generic parsing errors
--------------------------------------------------------------------------------
--[[- A fallback error when we can't produce anything more useful.
@tparam number token The token id.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.unexpected_token(token, start_pos, end_pos)
expect(1, token, "number")
expect(2, start_pos, "number")
expect(3, end_pos, "number")
return {
"Unexpected " .. token_names[token] .. ".",
annotate(start_pos, end_pos),
}
end
--[[- A parenthesised expression was started but not closed.
@tparam number open_start The start position of the opening bracket.
@tparam number open_end The end position of the opening bracket.
@tparam number token The id of the unexpected token.
@tparam number start_pos The start position of the unexpected token.
@tparam number end_pos The end position of the unexpected token.
@return The resulting parse error.
]]
function errors.unclosed_brackets(open_start, open_end, token, start_pos, end_pos)
    expect(1, open_start, "number")
    expect(2, open_end, "number")
    expect(3, token, "number")
    expect(4, start_pos, "number")
    expect(5, end_pos, "number")
    -- TODO: Do we want to be smarter here with where we report the error?
    return {
        "Unexpected " .. token_names[token] .. ". Are you missing a closing bracket?",
        annotate(open_start, open_end, "Brackets were opened here."),
        annotate(start_pos, end_pos, "Unexpected " .. token_names[token] .. " here."),
    }
end
--[[- Expected `(` to open our function arguments.
@tparam number token The token id.
@tparam number start_pos The start position of the token.
@tparam number end_pos The end position of the token.
@return The resulting parse error.
]]
function errors.expected_function_args(token, start_pos, end_pos)
    -- Validate arguments, for consistency with the other error factories.
    expect(1, token, "number")
    expect(2, start_pos, "number")
    expect(3, end_pos, "number")
    return {
        "Unexpected " .. token_names[token] .. ". Expected " .. code("(") .. " to start function arguments.",
        annotate(start_pos, end_pos),
    }
end
return errors

View File

@ -0,0 +1,100 @@
--[[- The main entrypoint to our Lua parser
:::warning
This is an internal module and SHOULD NOT be used in your own code. It may
be removed or changed at any time.
:::
@local
]]
local expect = require "cc.expect".expect
local lex_one = require "cc.internal.syntax.lexer".lex_one
local parser = require "cc.internal.syntax.parser"
local error_printer = require "cc.internal.error_printer"
--- Parse a string of Lua source, reporting any syntax errors to the terminal.
--
-- @tparam string input The string to parse.
-- @tparam number start_symbol The grammar's start symbol (program or REPL input).
-- @treturn boolean Whether the string was successfully parsed.
local function parse(input, start_symbol)
    expect(1, input, "string")
    expect(2, start_symbol, "number")
    -- Lazy-load the parser.
    local parse, tokens, last_token = parser.parse, parser.tokens, parser.tokens.COMMENT
    -- A unique value raised/caught to abort once an error has been printed.
    local error_sentinel = {}
    local context = {}
    -- Byte offsets at which each line starts; line 1 starts at offset 1.
    local lines = { 1 }
    function context.line(pos) lines[#lines + 1] = pos end
    -- Map an opaque position to a (line, column) pair.
    function context.get_pos(pos)
        expect(1, pos, "number")
        for i = #lines, 1, -1 do
            local start = lines[i]
            if pos >= start then return i, pos - start + 1 end
        end
        error("Position is <= 0", 2)
    end
    -- Fetch the text of the line containing an opaque position.
    function context.get_line(pos)
        expect(1, pos, "number")
        for i = #lines, 1, -1 do
            local start = lines[i]
            if pos >= start then return input:match("[^\r\n]*", start) end
        end
        error("Position is <= 0", 2)
    end
    -- Print the error message and abort parsing via the sentinel.
    function context.report(msg)
        expect(1, msg, "table")
        error_printer(context, msg)
        error(error_sentinel)
    end
    local pos = 1
    local ok, err = pcall(parse, context, function()
        -- Pull tokens until we find one the parser accepts (i.e. with an id
        -- below COMMENT), turning lexer errors into the sentinel.
        while true do
            local token, start, finish = lex_one(context, input, pos)
            if not token then return tokens.EOF, #input + 1, #input + 1 end
            pos = finish + 1
            if token < last_token then
                return token, start, finish
            elseif token == tokens.ERROR then
                error(error_sentinel)
            end
        end
    end, start_symbol)
    if ok then
        return true
    elseif err == error_sentinel then
        return false
    else
        -- Not one of ours: rethrow without adding position information.
        error(err, 0)
    end
end
--[[- Parse a Lua program, printing syntax errors to the terminal.
@tparam string input The string to parse.
@treturn boolean Whether the string was successfully parsed.
]]
local function parse_program(input) return parse(input, parser.program) end
--[[- Parse a REPL input (either a program or a list of expressions), printing
syntax errors to the terminal.
@tparam string input The string to parse.
@treturn boolean Whether the string was successfully parsed.
]]
local function parse_repl(input) return parse(input, parser.repl_exprs) end
return {
parse_program = parse_program,
parse_repl = parse_repl,
}

View File

@ -0,0 +1,359 @@
--[[- A lexer for Lua source code.
:::warning
This is an internal module and SHOULD NOT be used in your own code. It may
be removed or changed at any time.
:::
This module provides utilities for lexing Lua code, returning tokens compatible
with @{cc.internal.syntax.parser}. While all lexers are roughly the same, there
are some design choices worth drawing attention to:
- The lexer uses Lua patterns (i.e. @{string.find}) as much as possible,
trying to avoid @{string.sub} loops except when needed. This allows us to
move string processing to native code, which ends up being much faster.
- We try to avoid allocating where possible. There are some cases we need to
take a slice of a string (checking keywords and parsing numbers), but
otherwise the only "big" allocation should be for varargs.
- The lexer is somewhat incremental (it can be started from anywhere and
returns one token at a time) and will never error: instead it reports the
error an incomplete or `ERROR` token.
@local
]]
local errors = require "cc.internal.syntax.errors"
local tokens = require "cc.internal.syntax.parser".tokens
local sub, find = string.sub, string.find
--- Lua's keywords, mapped to their token ids. Identifiers not present here
-- are lexed as tokens.IDENT.
local keywords = {
    ["and"] = tokens.AND, ["break"] = tokens.BREAK, ["do"] = tokens.DO, ["else"] = tokens.ELSE,
    ["elseif"] = tokens.ELSEIF, ["end"] = tokens.END, ["false"] = tokens.FALSE, ["for"] = tokens.FOR,
    ["function"] = tokens.FUNCTION, ["if"] = tokens.IF, ["in"] = tokens.IN, ["local"] = tokens.LOCAL,
    ["nil"] = tokens.NIL, ["not"] = tokens.NOT, ["or"] = tokens.OR, ["repeat"] = tokens.REPEAT,
    ["return"] = tokens.RETURN, ["then"] = tokens.THEN, ["true"] = tokens.TRUE, ["until"] = tokens.UNTIL,
    ["while"] = tokens.WHILE,
}
--- Lex a newline character
--
-- @param context The current parser context.
-- @tparam string str The current string.
-- @tparam number pos The position of the newline character.
-- @tparam string nl The current new line character, either "\n" or "\r".
-- @treturn pos The new position, after the newline.
local function newline(context, str, pos, nl)
    local next_pos = pos + 1
    local c = sub(str, next_pos, next_pos)
    -- Consume the second half of a two-character line ending ("\r\n"/"\n\r").
    if (c == "\r" or c == "\n") and c ~= nl then next_pos = next_pos + 1 end
    context.line(next_pos) -- Mark the start of the next line.
    return next_pos
end
--- Lex a number
--
-- @param context The current parser context.
-- @tparam string str The current string.
-- @tparam number start The start position of this number.
-- @treturn number The token id for numbers.
-- @treturn number The end position of this number
local function lex_number(context, str, start)
    local pos = start + 1
    -- The exponent markers: "e"/"E" for decimal literals, "p"/"P" for hex.
    local exp_low, exp_high = "e", "E"
    if sub(str, start, start) == "0" then
        local next = sub(str, pos, pos)
        if next == "x" or next == "X" then
            pos = pos + 1
            exp_low, exp_high = "p", "P"
        end
    end
    while true do
        local c = sub(str, pos, pos)
        if c == exp_low or c == exp_high then
            pos = pos + 1
            c = sub(str, pos, pos)
            -- Permit an optional sign immediately after the exponent marker.
            if c == "+" or c == "-" then
                pos = pos + 1
            end
        elseif (c >= "0" and c <= "9") or (c >= "a" and c <= "f") or (c >= "A" and c <= "F") or c == "." then
            -- We deliberately over-accept here (e.g. hex digits in decimal
            -- numbers, repeated dots) and rely on the tonumber check below to
            -- reject malformed literals.
            pos = pos + 1
        else
            break
        end
    end
    local contents = sub(str, start, pos - 1)
    if not tonumber(contents) then
        -- TODO: Separate error for "2..3"?
        context.report(errors.malformed_number(start, pos - 1))
    end
    return tokens.NUMBER, pos - 1
end
--- Lex a quoted string.
--
-- @param context The current parser context.
-- @tparam string str The string we're lexing.
-- @tparam number start_pos The start position of the string.
-- @tparam string quote The quote character, either " or '.
-- @treturn number The token id for strings.
-- @treturn number The new position.
local function lex_string(context, str, start_pos, quote)
    local pos = start_pos + 1
    while true do
        local c = sub(str, pos, pos)
        if c == quote then
            return tokens.STRING, pos
        elseif c == "\n" or c == "\r" or c == "" then
            -- We don't call newline here, as that's done for the next token.
            context.report(errors.unfinished_string(start_pos, pos, quote))
            return tokens.STRING, pos - 1
        elseif c == "\\" then
            c = sub(str, pos + 1, pos + 1)
            if c == "\n" or c == "\r" then
                -- An escaped line break: consume the (possibly two-character) newline.
                pos = newline(context, str, pos + 1, c)
            elseif c == "" then
                -- The input ends on a lone backslash.
                context.report(errors.unfinished_string_escape(start_pos, pos, quote))
                return tokens.STRING, pos
            elseif c == "z" then
                -- "\z" skips all following whitespace, tracking any newlines we cross.
                pos = pos + 2
                while true do
                    local next_pos, _, c = find(str, "([%S\r\n])", pos)
                    if not next_pos then
                        context.report(errors.unfinished_string(start_pos, #str, quote))
                        return tokens.STRING, #str
                    end
                    if c == "\n" or c == "\r" then
                        pos = newline(context, str, next_pos, c)
                    else
                        pos = next_pos
                        break
                    end
                end
            else
                -- Any other escape: skip the backslash and the escaped character.
                pos = pos + 2
            end
        else
            pos = pos + 1
        end
    end
end
--- Consume the start or end of a long string.
-- @tparam string str The input string.
-- @tparam number pos The start position. This must be after the first `[` or `]`.
-- @tparam string fin The terminating character, either `[` or `]`.
-- @treturn boolean Whether a long string was successfully started.
-- @treturn number The current position.
local function lex_long_str_boundary(str, pos, fin)
    -- Skip over the (possibly empty) run of "=" level markers.
    while sub(str, pos, pos) == "=" do pos = pos + 1 end
    -- The boundary is well-formed only if the run ends with the closing char.
    return sub(str, pos, pos) == fin, pos
end
--- Lex a long string.
-- @param context The current parser context.
-- @tparam string str The input string.
-- @tparam number start The start position, after the input boundary.
-- @tparam number len The expected length of the boundary. Equal to 1 + the
-- number of `=`.
-- @treturn number|nil The end position, or @{nil} if this is not terminated.
local function lex_long_str(context, str, start, len)
    local pos = start
    while true do
        -- Jump straight to the next character of interest: a bracket or a newline.
        pos = find(str, "[%[%]\n\r]", pos)
        if not pos then return nil end
        local c = sub(str, pos, pos)
        if c == "]" then
            -- A candidate closing delimiter: only accept it when the "=" count matches.
            local ok, boundary_pos = lex_long_str_boundary(str, pos + 1, "]")
            if ok and boundary_pos - pos == len then
                return boundary_pos
            else
                pos = boundary_pos
            end
        elseif c == "[" then
            -- Report "[[" nested inside a level-0 long string.
            local ok, boundary_pos = lex_long_str_boundary(str, pos + 1, "[")
            if ok and boundary_pos - pos == len and len == 1 then
                context.report(errors.nested_long_str(pos, boundary_pos))
            end
            pos = boundary_pos
        else
            -- Keep line numbers up to date as we cross newlines.
            pos = newline(context, str, pos, c)
        end
    end
end
--- Lex a single token, assuming we have removed all leading whitespace.
--
-- @param context The current parser context.
-- @tparam string str The string we're lexing.
-- @tparam number pos The start position.
-- @treturn number The id of the parsed token.
-- @treturn number The end position of this token.
-- @treturn string|nil The token's current contents (only given for identifiers)
local function lex_token(context, str, pos)
    local c = sub(str, pos, pos)

    -- Identifiers and keywords
    if (c >= "a" and c <= "z") or (c >= "A" and c <= "Z") or c == "_" then
        local _, end_pos = find(str, "^[%w_]+", pos)
        if not end_pos then error("Impossible: No position") end
        local contents = sub(str, pos, end_pos)
        return keywords[contents] or tokens.IDENT, end_pos, contents

    -- Numbers
    elseif c >= "0" and c <= "9" then return lex_number(context, str, pos)

    -- Strings
    elseif c == "\"" or c == "\'" then return lex_string(context, str, pos, c)

    elseif c == "[" then
        local ok, boundary_pos = lex_long_str_boundary(str, pos + 1, "[")
        if ok then -- Long string
            local end_pos = lex_long_str(context, str, boundary_pos + 1, boundary_pos - pos)
            if end_pos then return tokens.STRING, end_pos end

            context.report(errors.unfinished_long_string(pos, boundary_pos, boundary_pos - pos))
            return tokens.ERROR, #str
        elseif pos + 1 == boundary_pos then -- Just a "["
            return tokens.OSQUARE, pos
        else -- Malformed long string, for instance "[="
            context.report(errors.malformed_long_string(pos, boundary_pos, boundary_pos - pos))
            return tokens.ERROR, boundary_pos
        end

    elseif c == "-" then
        c = sub(str, pos + 1, pos + 1)
        if c ~= "-" then return tokens.SUB, pos end

        local comment_pos = pos + 2 -- Advance to the start of the comment

        -- Check if we're a long comment ("--[[ ... ]]" and friends).
        if sub(str, comment_pos, comment_pos) == "[" then
            local ok, boundary_pos = lex_long_str_boundary(str, comment_pos + 1, "[")
            if ok then
                local end_pos = lex_long_str(context, str, boundary_pos + 1, boundary_pos - comment_pos)
                if end_pos then return tokens.COMMENT, end_pos end

                context.report(errors.unfinished_long_comment(pos, boundary_pos, boundary_pos - comment_pos))
                return tokens.ERROR, #str
            end
        end

        -- Otherwise fall back to a line comment.
        local _, end_pos = find(str, "^[^\n\r]*", comment_pos)
        return tokens.COMMENT, end_pos

    elseif c == "." then
        local next_pos = pos + 1
        local next_char = sub(str, next_pos, next_pos)
        if next_char >= "0" and next_char <= "9" then
            return lex_number(context, str, pos) -- ".5"-style number
        elseif next_char ~= "." then
            return tokens.DOT, pos
        end

        -- Two dots so far: distinguish ".." (concat) from "..." (varargs).
        if sub(str, pos + 2, pos + 2) ~= "." then return tokens.CONCAT, next_pos end

        return tokens.DOTS, pos + 2

    elseif c == "=" then
        local next_pos = pos + 1
        if sub(str, next_pos, next_pos) == "=" then return tokens.EQ, next_pos end
        return tokens.EQUALS, pos

    elseif c == ">" then
        local next_pos = pos + 1
        -- FIX: ">=" must lex as GE (this previously returned LE).
        if sub(str, next_pos, next_pos) == "=" then return tokens.GE, next_pos end
        return tokens.GT, pos

    elseif c == "<" then
        local next_pos = pos + 1
        if sub(str, next_pos, next_pos) == "=" then return tokens.LE, next_pos end
        -- FIX: a bare "<" must lex as LT (this previously returned GT).
        return tokens.LT, pos

    elseif c == "~" and sub(str, pos + 1, pos + 1) == "=" then return tokens.NE, pos + 1

    -- Single character tokens
    elseif c == "," then return tokens.COMMA, pos
    elseif c == ";" then return tokens.SEMICOLON, pos
    elseif c == ":" then return tokens.COLON, pos
    elseif c == "(" then return tokens.OPAREN, pos
    elseif c == ")" then return tokens.CPAREN, pos
    elseif c == "]" then return tokens.CSQUARE, pos
    elseif c == "{" then return tokens.OBRACE, pos
    elseif c == "}" then return tokens.CBRACE, pos
    elseif c == "*" then return tokens.MUL, pos
    elseif c == "/" then return tokens.DIV, pos
    elseif c == "#" then return tokens.LEN, pos
    elseif c == "%" then return tokens.MOD, pos
    elseif c == "^" then return tokens.POW, pos
    elseif c == "+" then return tokens.ADD, pos

    else
        -- An entirely unknown glyph. Take a short run of "unusual" characters
        -- (up to 4, stopping before whitespace/alphanumerics/brackets) and see
        -- if it matches a common operator typo from other languages.
        local end_pos = find(str, "[%s%w(){}%[%]]", pos)
        if end_pos then end_pos = end_pos - 1 else end_pos = #str end

        if end_pos - pos <= 3 then
            local contents = sub(str, pos, end_pos)
            if contents == "&&" then
                context.report(errors.wrong_and(pos, end_pos))
                return tokens.AND, end_pos
            elseif contents == "||" then
                context.report(errors.wrong_or(pos, end_pos))
                return tokens.OR, end_pos
            elseif contents == "!=" or contents == "<>" then
                context.report(errors.wrong_ne(pos, end_pos))
                return tokens.NE, end_pos
            end
        end

        context.report(errors.unexpected_character(pos))
        return tokens.ERROR, end_pos
    end
end
--[[- Lex a single token from an input string.

@param context The current parser context.
@tparam string str The string we're lexing.
@tparam number pos The start position.
@treturn[1] number The id of the parsed token.
@treturn[1] number The start position of this token.
@treturn[1] number The end position of this token.
@treturn[1] string|nil The token's current contents (only given for identifiers)
@treturn[2] nil If there are no more tokens to consume
]]
local function lex_one(context, str, pos)
    -- Skip whitespace, stopping at the first non-space character. Newlines are
    -- matched explicitly so they can be recorded in the context as we pass them.
    local start_pos, _, char = find(str, "([%S\r\n])", pos)
    while start_pos and (char == "\r" or char == "\n") do
        pos = newline(context, str, start_pos, char)
        start_pos, _, char = find(str, "([%S\r\n])", pos)
    end

    if not start_pos then return end -- Only whitespace remains: no more tokens.

    local token_id, end_pos, content = lex_token(context, str, start_pos)
    return token_id, start_pos, end_pos, content
end
-- The lexer's public interface. Everything else in this module is internal.
return {
    lex_one = lex_one,
}

View File

@ -51,14 +51,20 @@ end
local runHandler = [[multishell.setTitle(multishell.getCurrent(), %q)
local current = term.current()
local ok, err = load(%q, %q, nil, _ENV)
if ok then ok, err = pcall(ok, ...) end
term.redirect(current)
term.setTextColor(term.isColour() and colours.yellow or colours.white)
term.setBackgroundColor(colours.black)
term.setCursorBlink(false)
if not ok then
printError(err)
local contents = %q
local fn, err = load(contents, %q, nil, _ENV)
if fn then
local ok, err = pcall(fn, ...)
term.redirect(current)
term.setTextColor(term.isColour() and colours.yellow or colours.white)
term.setBackgroundColor(colours.black)
term.setCursorBlink(false)
if not ok then printError(err) end
else
local parser = require "cc.internal.syntax"
if parser.parse_program(contents) then printError(err) end
end
local message = "Press any key to continue."

View File

@ -51,7 +51,7 @@ while bRunning do
write("lua> ")
--term.setTextColour( colours.white )
local s = read(nil, tCommandHistory, function(sLine)
local input = read(nil, tCommandHistory, function(sLine)
if settings.get("lua.autocomplete") then
local nStartPos = string.find(sLine, "[a-zA-Z0-9_%.:]+$")
if nStartPos then
@ -63,10 +63,10 @@ while bRunning do
end
return nil
end)
if s:match("%S") and tCommandHistory[#tCommandHistory] ~= s then
table.insert(tCommandHistory, s)
if input:match("%S") and tCommandHistory[#tCommandHistory] ~= input then
table.insert(tCommandHistory, input)
end
if settings.get("lua.warn_against_use_of_local") and s:match("^%s*local%s+") then
if settings.get("lua.warn_against_use_of_local") and input:match("^%s*local%s+") then
if term.isColour() then
term.setTextColour(colours.yellow)
end
@ -75,12 +75,12 @@ while bRunning do
end
local nForcePrint = 0
local func, e = load(s, "=lua", "t", tEnv)
local func2 = load("return _echo(" .. s .. ");", "=lua", "t", tEnv)
local func, err = load(input, "=lua", "t", tEnv)
local func2 = load("return _echo(" .. input .. ");", "=lua", "t", tEnv)
if not func then
if func2 then
func = func2
e = nil
err = nil
nForcePrint = 1
end
else
@ -110,7 +110,8 @@ while bRunning do
printError(tResults[2])
end
else
printError(e)
local parser = require "cc.internal.syntax"
if parser.parse_repl(input) then printError(err) end
end
end

View File

@ -108,10 +108,11 @@ local function executeProgram(remainingRecursion, path, args)
end
-- First check if the file begins with a #!
local contents = file.readLine()
file.close()
local contents = file.readLine() or ""
if contents:sub(1, 2) == "#!" then
file.close()
if contents and contents:sub(1, 2) == "#!" then
remainingRecursion = remainingRecursion - 1
if remainingRecursion == 0 then
printError("Hashbang recursion depth limit reached when loading file: " .. path)
@ -137,11 +138,40 @@ local function executeProgram(remainingRecursion, path, args)
return executeProgram(remainingRecursion, resolvedHashbangProgram, hashbangArgs)
end
contents = contents .. "\n" .. (file.readAll() or "")
file.close()
local dir = fs.getDir(path)
local env = createShellEnv(dir)
local env = setmetatable(createShellEnv(dir), { __index = _G })
env.arg = args
return os.run(env, path, table.unpack(args))
local func, err = load(contents, "@" .. fs.getName(path), nil, env)
if not func then
-- We had a syntax error. Attempt to run it through our own parser if
-- the file is "small enough", otherwise report the original error.
if #contents < 1024 * 128 then
local parser = require "cc.internal.syntax"
if parser.parse_program(contents) then printError(err) end
else
printError(err)
end
return false
end
if settings.get("bios.strict_globals", false) then
getmetatable(env).__newindex = function(_, name)
error("Attempt to create global " .. tostring(name), 2)
end
end
local ok, err = pcall(func, table.unpack(args))
if ok then
return true
else
if err and err ~= "" then printError(err) end
return false
end
end
--- Run a program with the supplied arguments.

View File

@ -0,0 +1,52 @@
local helpers = require "test_helpers"
describe("cc.internal.syntax", function()
local syntax = require "cc.internal.syntax"
local parser = require "cc.internal.syntax.parser"
local syntax_helpers = require "modules.cc.internal.syntax.syntax_helpers"
describe("can parse all of CC's Lua files", function()
local function list_dir(path)
if not path then path = "/" end
for _, child in pairs(fs.list(path)) do
child = fs.combine(path, child)
if fs.isDir(child) then list_dir(child)
elseif child:sub(-4) == ".lua" then coroutine.yield(child)
end
end
end
for file in coroutine.wrap(list_dir) do
it(file, function()
helpers.with_window(50, 10, function()
local h = fs.open(file, "r")
local contents = h.readAll()
h.close()
expect(syntax.parse_program(contents)):describe(file):eq(true)
end)
end)
end
end)
-- We specify most of the parser's behaviour as golden tests. A little nasty
-- (it's more of an end-to-end test), but much easier to write!
local function describe_golden(name, path, print_tokens)
helpers.describe_golden(name, "test-rom/spec/modules/cc/internal/syntax/" .. path, function(lua, extra)
local start = nil
if #extra > 0 then
start = parser[extra:match("^{([a-z_]+)}$")]
if not start then
fail("Cannot extract start symbol " .. extra)
end
end
return syntax_helpers.capture_parser(lua, print_tokens, start)
end)
end
describe_golden("the lexer", "lexer_spec.md", true)
describe_golden("the parser", "parser_spec.md", false)
describe_golden("the parser (all states)", "parser_exhaustive_spec.md", false)
end)

View File

@ -0,0 +1,319 @@
We provide a lexer for Lua source code. Here we test that the lexer returns the
correct tokens and positions, and that it can report sensible error messages.
# Comments
## Single-line comments
We can lex some basic comments:
```lua
-- A basic singleline comment comment
--[ Not a multiline comment
--[= Also not a multiline comment!
```
```txt
1:1-1:37 COMMENT -- A basic singleline comment comment
2:1-2:27 COMMENT --[ Not a multiline comment
3:1-3:34 COMMENT --[= Also not a multiline comment!
```
It's also useful to test empty comments (including no trailing newline) separately:
```lua
--
```
```txt
1:1-1:2 COMMENT --
```
## Multi-line comments
Multiline/long-string-style comments are also supported:
```lua
--[[
A
multiline
comment
]]
--[=[ ]==] ]] ]=]
--[[ ]=]]
```
```txt
1:1-5:2 COMMENT --[[<NL> A<NL> multiline<NL> comment<NL>]]
7:1-7:18 COMMENT --[=[ ]==] ]] ]=]
9:1-9:9 COMMENT --[[ ]=]]
```
We also fail on unfinished comments:
```lua
--[=[
```
```txt
This comment was never finished.
|
1 | --[=[
| ^^^^^ Comment was started here.
We expected a closing delimiter (]=]) somewhere after this comment was started.
1:1-1:5 ERROR --[=[
```
Nested comments are rejected, just as Lua 5.1 does:
```lua
--[[ [[ ]]
```
```txt
[[ cannot be nested inside another [[ ... ]]
|
1 | --[[ [[ ]]
| ^^
1:1-1:10 COMMENT --[[ [[ ]]
```
# Strings
We can lex basic strings:
```lua
return "abc", "abc\"", 'abc', 'abc\z
', "abc\
continued"
```
```txt
1:1-1:6 RETURN return
1:8-1:12 STRING "abc"
1:13-1:13 COMMA ,
1:15-1:21 STRING "abc\""
1:22-1:22 COMMA ,
1:24-1:28 STRING 'abc'
1:29-1:29 COMMA ,
1:31-3:1 STRING 'abc\z<NL><NL>'
3:2-3:2 COMMA ,
3:4-4:10 STRING "abc\<NL>continued"
```
We can also lex unterminated strings, including those where there's no closing
quote:
```lua
return "abc
```
```txt
1:1-1:6 RETURN return
This string is not finished. Are you missing a closing quote (")?
|
1 | return "abc
| ^ String started here.
|
1 | return "abc
| ^ Expected a closing quote here.
1:8-1:11 STRING "abc
```
And those where the zap is malformed:
```lua
return "abc\z
```
```txt
1:1-1:6 RETURN return
This string is not finished. Are you missing a closing quote (")?
|
1 | return "abc\z
| ^ String started here.
|
1 | return "abc\z
| ^ Expected a closing quote here.
1:8-1:14 STRING "abc\z<NL>
```
Finally, strings where the escape is entirely missing:
```lua
return "abc\
```
```txt
1:1-1:6 RETURN return
This string is not finished.
|
1 | return "abc\
| ^ String started here.
|
1 | return "abc\
| ^ An escape sequence was started here, but with nothing following it.
1:8-1:12 STRING "abc\
```
## Multi-line/long strings
We can also handle long strings fine:
```lua
return [[a b c]], [=[a b c ]=]
```
```txt
1:1-1:6 RETURN return
1:8-1:16 STRING [[a b c]]
1:17-1:17 COMMA ,
1:19-1:30 STRING [=[a b c ]=]
```
Unfinished long strings are correctly reported:
```lua
return [[
```
```txt
1:1-1:6 RETURN return
This string was never finished.
|
1 | return [[
| ^^ String was started here.
We expected a closing delimiter (]]) somewhere after this string was started.
1:8-1:9 ERROR [[
```
We also handle malformed opening strings:
```lua
return [=
```
```txt
1:1-1:6 RETURN return
Incorrect start of a long string.
|
1 | return [=
| ^^^
Tip: If you wanted to start a long string here, add an extra [ here.
1:8-1:10 ERROR [=
```
# Numbers
```lua
return 0, 0.0, 0e1, .23, 0x23, 23e-2, 23e+2
```
```txt
1:1-1:6 RETURN return
1:8-1:8 NUMBER 0
1:9-1:9 COMMA ,
1:11-1:13 NUMBER 0.0
1:14-1:14 COMMA ,
1:16-1:18 NUMBER 0e1
1:19-1:19 COMMA ,
1:21-1:23 NUMBER .23
1:24-1:24 COMMA ,
1:26-1:29 NUMBER 0x23
1:30-1:30 COMMA ,
1:32-1:36 NUMBER 23e-2
1:37-1:37 COMMA ,
1:39-1:43 NUMBER 23e+2
```
We also handle malformed numbers:
```lua
return 2..3, 2eee2
```
```txt
1:1-1:6 RETURN return
This isn't a valid number.
|
1 | return 2..3, 2eee2
| ^^^^
Numbers must be in one of the following formats: 123, 3.14, 23e35, 0x01AF.
1:8-1:11 NUMBER 2..3
1:12-1:12 COMMA ,
This isn't a valid number.
|
1 | return 2..3, 2eee2
| ^^^^^
Numbers must be in one of the following formats: 123, 3.14, 23e35, 0x01AF.
1:14-1:18 NUMBER 2eee2
```
# Unknown tokens
We can suggest alternatives for possible errors:
```lua
if a != b then end
if a ~= b then end
if a && b then end
if a || b then end
```
```txt
1:1-1:2 IF if
1:4-1:4 IDENT a
Unexpected character.
|
1 | if a != b then end
| ^^
Tip: Replace this with ~= to check if two values are not equal.
1:6-1:7 NE !=
1:9-1:9 IDENT b
1:11-1:14 THEN then
1:16-1:18 END end
2:1-2:2 IF if
2:4-2:4 IDENT a
2:6-2:7 NE ~=
2:9-2:9 IDENT b
2:11-2:14 THEN then
2:16-2:18 END end
3:1-3:2 IF if
3:4-3:4 IDENT a
Unexpected character.
|
3 | if a && b then end
| ^^
Tip: Replace this with and to check if both values are true.
3:6-3:7 AND &&
3:9-3:9 IDENT b
3:11-3:14 THEN then
3:16-3:18 END end
4:1-4:2 IF if
4:4-4:4 IDENT a
Unexpected character.
|
4 | if a || b then end
| ^^
Tip: Replace this with or to check if either value is true.
4:6-4:7 OR ||
4:9-4:9 IDENT b
4:11-4:14 THEN then
4:16-4:18 END end
```
For entirely unknown glyphs we should just give up and return an `ERROR` token.
```lua
return $*&(*)xyz
```
```txt
1:1-1:6 RETURN return
Unexpected character.
|
1 | return $*&(*)xyz
| ^ This character isn't usable in Lua code.
1:8-1:10 ERROR $*&
```

View File

@ -0,0 +1,294 @@
We provide a parser for Lua source code. Here we test that the parser reports
sensible syntax errors in specific cases.
# Expressions
## Invalid equals
We correct the user if they type `=` instead of `==`.
```lua
if a = b then end
```
```txt
Unexpected = in expression.
|
1 | if a = b then end
| ^
Tip: Replace this with == to check if two values are equal.
```
We apply a slightly different error when this occurs in tables:
```lua
return { "abc" = "def" }
```
```txt
Unexpected = in expression.
|
1 | return { "abc" = "def" }
| ^
Tip: Wrap the preceding expression in [ and ] to use it as a table key.
```
Note this doesn't occur if there's already a table key here:
```lua
return { x = "abc" = }
```
```txt
Unexpected = in expression.
|
1 | return { x = "abc" = }
| ^
Tip: Replace this with == to check if two values are equal.
```
## Unclosed parenthesis
We warn on unclosed parenthesis in expressions:
```lua
return (2
```
```txt
Unexpected end of file. Are you missing a closing bracket?
|
1 | return (2
| ^ Brackets were opened here.
|
1 | return (2
| ^ Unexpected end of file here.
```
Function calls:
```lua
return f(2
```
```txt
Unexpected end of file. Are you missing a closing bracket?
|
1 | return f(2
| ^ Brackets were opened here.
|
1 | return f(2
| ^ Unexpected end of file here.
```
and function definitions:
```lua
local function f(a
```
```txt
Unexpected end of file. Are you missing a closing bracket?
|
1 | local function f(a
| ^ Brackets were opened here.
|
1 | local function f(a
| ^ Unexpected end of file here.
```
# Statements
## Local functions with table identifiers
We provide a custom error for using `.` inside a `local function` name.
```lua
local function x.f() end
```
```txt
Cannot use local function with a table key.
|
1 | local function x.f() end
| ^ . appears here.
|
1 | local function x.f() end
| ^^^^^ Tip: Try removing this local keyword.
```
## Standalone identifiers
A common error is a user forgetting to use `()` to call a function. We provide
a custom error for this case:
```lua
term.clear
local _ = 1
```
```txt
Unexpected symbol after variable.
|
1 | term.clear
| ^ Expected something before the end of the line.
Tip: Use () to call with no arguments.
```
If the next symbol is on the same line we provide a slightly different error:
```lua
x 1
```
```txt
Unexpected symbol after name.
|
1 | x 1
| ^
Did you mean to assign this or call it as a function?
```
An EOF token is treated as a new line.
```lua
term.clear
```
```txt
Unexpected symbol after variable.
|
1 | term.clear
| ^ Expected something before the end of the line.
Tip: Use () to call with no arguments.
```
## If statements
For if statements, we say when we expected the `then` keyword.
```lua
if 0
```
```txt
Expected then after if condition.
|
1 | if 0
| ^^ If statement started here.
|
1 | if 0
| ^ Expected then before here.
```
```lua
if 0 then
elseif 0
```
```txt
Expected then after if condition.
|
2 | elseif 0
| ^^^^^^ If statement started here.
|
2 | elseif 0
| ^ Expected then before here.
```
## Expecting `end`
We provide errors for missing `end`s.
```lua
if true then
print("Hello")
```
```txt
Unexpected end of file. Expected end or another statement.
|
1 | if true then
| ^^ Block started here.
|
2 | print("Hello")
| ^ Expected end of block here.
```
```lua
while true do
print("Hello")
```
```txt
Unexpected end of file. Expected end or another statement.
|
1 | while true do
| ^^^^^ Block started here.
|
2 | print("Hello")
| ^ Expected end of block here.
```
While we typically see these errors at the end of the file, there are some cases
where they may occur before then:
```lua
return (function()
if true then
)()
```
```txt
Unexpected ). Expected end or another statement.
|
2 | if true then
| ^^ Block started here.
|
3 | )()
| ^ Expected end of block here.
```
Note we do not currently attempt to identify mismatched `end`s. This might be
something to do in the future.
```lua
if true then
while true do
end
```
```txt
Unexpected end of file. Expected end or another statement.
|
1 | if true then
| ^^ Block started here.
|
3 | end
| ^ Expected end of block here.
```
## Unexpected `end`
We also print when there's more `end`s than expected.
```lua
if true then
end
end
```
```txt
Unexpected end.
|
3 | end
| ^^^
```
```lua
repeat
if true then
end
end
until true
```
```txt
Unexpected end.
|
4 | end
| ^^^
```

View File

@ -0,0 +1,107 @@
local expect = require "cc.expect".expect
local lex_one = require "cc.internal.syntax.lexer".lex_one
local parser = require "cc.internal.syntax.parser"
local tokens, last_token = parser.tokens, parser.tokens.COMMENT
--- Make a dummy context.
--
-- This tracks line start positions (via `line`) and maps absolute offsets back
-- to (line, column, line-start) triples (via `get_pos`), mirroring the parser
-- context used by the real lexer. The `input` argument is currently unused.
local function make_context(input)
    local line_starts = { 1 }

    -- Record the start offset of a new line.
    local function line(offset) line_starts[#line_starts + 1] = offset end

    -- Resolve an absolute offset to a line number, column and line start.
    local function get_pos(offset)
        for line_no = #line_starts, 1, -1 do
            local line_start = line_starts[line_no]
            if offset >= line_start then
                return line_no, offset - line_start + 1, line_start
            end
        end
        error("Position is <= 0", 2)
    end

    return { line = line, get_pos = get_pos, lines = line_starts }
end
--[[- Run a parser on an input string, capturing its output.

This uses a simplified method of displaying errors (compared with
@{cc.internal.error_printer}), which is suitable for printing to a file.

@tparam string input The input string to parse.
@tparam[opt=false] boolean print_tokens Whether to print each token as its parsed.
@tparam[opt] number start The start state of the parser.
@treturn string The parser's output
]]
local function capture_parser(input, print_tokens, start)
    expect(1, input, "string")
    expect(2, print_tokens, "boolean", "nil")
    expect(3, start, "number", "nil")

    -- Sentinel raised (and later filtered out) to abort parsing after an
    -- ERROR token, whose message has already been captured via report().
    local error_sentinel = {}
    local out = {}

    -- Shadows the global print: all output is accumulated into `out` and
    -- joined at the end, so it can be compared against a golden file.
    local function print(x) out[#out + 1] = tostring(x) end

    -- Reverse-lookup a token id's name in the `tokens` table, for display.
    local function get_name(token)
        for name, tok in pairs(tokens) do if tok == token then return name end end
        return "?[" .. tostring(token) .. "]"
    end

    local context = make_context(input)
    -- Render a reported error: plain messages are printed verbatim, while
    -- "annotate" entries are rendered as a source excerpt with a ^^^ marker.
    function context.report(message)
        -- NOTE(review): argument index 3 looks odd for report's first argument
        -- — presumably chosen to match the real context's signature; confirm.
        expect(3, message, "table")

        for _, msg in ipairs(message) do
            if type(msg) == "table" and msg.tag == "annotate" then
                local line, col = context.get_pos(msg.start_pos)
                local end_line, end_col = context.get_pos(msg.end_pos)
                -- The full text of the line the annotation starts on.
                local contents = input:match("^([^\r\n]*)", context.lines[line])

                print(" |")
                print(("%2d | %s"):format(line, contents))
                -- Multi-line annotations are abbreviated to "^...".
                local indicator = line == end_line and ("^"):rep(end_col - col + 1) or "^..."
                if #msg.msg > 0 then
                    print((" | %s%s %s"):format((" "):rep(col - 1), indicator, msg.msg))
                else
                    print((" | %s%s"):format((" "):rep(col - 1), indicator))
                end
            else
                print(tostring(msg))
            end
        end
    end

    local pos = 1
    local ok, err = xpcall(function()
        -- Drive the parser with a token-supplying closure, optionally echoing
        -- each token (with positions) as it is produced.
        return parser.parse(context, function()
            while true do
                local token, start, finish, content = lex_one(context, input, pos)
                if not token then return tokens.EOF, #input + 1, #input + 1 end

                if print_tokens then
                    local start_line, start_col = context.get_pos(start)
                    local end_line, end_col = context.get_pos(finish)
                    local text = input:sub(start, finish)
                    print(("%d:%d-%d:%d %s %s"):format(
                        start_line, start_col, end_line, end_col,
                        get_name(token), content or text:gsub("\n", "<NL>")
                    ))
                end

                pos = finish + 1

                -- Tokens >= last_token (COMMENT, ERROR) are not fed to the
                -- parser: comments are skipped, errors abort via the sentinel.
                if token < last_token then
                    return token, start, finish
                elseif token == tokens.ERROR then
                    error(error_sentinel)
                end
            end
        end, start)
    end, debug.traceback)

    -- Genuine (non-sentinel) errors are captured too, traceback included.
    if not ok and err ~= error_sentinel then
        print(err)
    end

    return table.concat(out, "\n")
end
return { make_context = make_context, capture_parser = capture_parser }

View File

@ -86,9 +86,52 @@ local function timeout(time, fn)
end
end
--- Extract a series of tests from a markdown file.
--
-- The file is scanned for ```lua fenced blocks, each optionally followed by a
-- ```txt block giving the expected output. `generate(lua, extra)` (where
-- `extra` is any text on the ```lua fence line) produces the actual output,
-- which is compared against the txt block. Markdown headings become nested
-- `describe` groups.
local function describe_golden(name, file, generate)
    describe(name, function()
        local handle = assert(fs.open(file, "r"))
        -- A leading newline is prepended so the "\n(#+)" heading pattern also
        -- matches a heading on the very first line of the file.
        local contents = "\n" .. handle.readAll()
        handle.close()

        -- Shared cursor into `contents`, advanced by the recursive walk below.
        local pos = 1
        local function run(current_level)
            local test_idx = 1
            while true do
                -- Find the next code block and the next heading; whichever
                -- comes first decides what we do.
                local lua_start, lua_end, extra, lua = contents:find("```lua *([^\n]*)\n(.-)\n```\n?", pos)
                local heading_start, heading_end, heading_lvl, heading = contents:find("\n(#+) *([^\n]+)", pos)

                if heading and (not lua_start or heading_start < lua_start) then
                    -- A heading at or above our level ends this group; leave
                    -- `pos` untouched so the caller re-reads the heading.
                    if #heading_lvl <= current_level then
                        return
                    end

                    pos = heading_end + 1
                    describe(heading, function() run(#heading_lvl) end)
                elseif lua_end then
                    -- Expected output (optional): a ```txt block immediately
                    -- after the lua one.
                    local _, txt_end, txt = contents:find("^\n*```txt\n(.-)\n```\n?", lua_end + 1)

                    it("test #" .. test_idx, function()
                        expect(generate(lua, extra))
                            :describe("For input string <<<\n" .. lua .. "\n>>>")
                            :eq(txt)
                    end)

                    test_idx = test_idx + 1
                    pos = (txt_end or lua_end) + 1
                else
                    -- Neither a heading nor a code block remains: we're done.
                    return
                end
            end
        end

        run(0)
    end)
end
-- Public interface of the test helper module.
return {
    capture_program = capture_program,
    with_window = with_window,
    with_window_lines = with_window_lines,
    timeout = timeout,
    describe_golden = describe_golden,
}