janet/core/parse.c

/*
* Copyright (c) 2017 Calvin Rose
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/

#include <gst/gst.h>
#include <gst/parse.h>

/* Error message reported by root_state when a ')', ']' or '}' is seen
 * where a new value was expected. */
static const char UNEXPECTED_CLOSING_DELIM[] = "Unexpected closing delimiter";

/* The type of a ParseState. dispatch_char uses it to pick the handler
 * (root_state, form_state, string_state or token_state) for each byte. */
typedef enum ParseType {
    PTYPE_ROOT,   /* Bottom of the stack; a value appended here becomes p->value */
    PTYPE_FORM,   /* Inside a (...), [...] or {...} form */
    PTYPE_STRING, /* Inside a double-quoted string literal */
    PTYPE_TOKEN   /* Inside a bare token (number, nil/true/false or symbol) */
} ParseType;

/* Contain a parse state that goes on the parse stack.
 * Which member of buf is valid depends on type: FORM states use buf.form,
 * STRING and TOKEN states both use buf.string (tokens only use its buffer). */
struct GstParseState {
    ParseType type;
    /* Number of pending quotes saved from the parser by parser_push; applied
     * by parser_append to the next value appended to this state. */
    uint32_t quoteCount;
    union {
        struct {
            /* Closing character that completes this form: ')', ']' or '}' */
            uint8_t endDelimiter;
            /* Values collected so far inside the form */
            GstArray *array;
        } form;
        struct {
            /* Bytes collected so far for the string or token */
            GstBuffer *buffer;
            /* Number of hex digits consumed in the current \h escape */
            uint32_t count;
            /* Value accumulated from the hex digits of the current \h escape */
            uint32_t accum;
            /* Position within a string literal with respect to escapes */
            enum {
                STRING_STATE_BASE,
                STRING_STATE_ESCAPE,
                STRING_STATE_ESCAPE_UNICODE,
                STRING_STATE_ESCAPE_HEX
            } state;
        } string;
    } buf;
};

/* Handle error in parsing: store the message e in p->error and set
 * p->status to GST_PARSER_ERROR. Note that p is evaluated twice, so the
 * argument must not have side effects. */
#define p_error(p, e) ((p)->error = (e), (p)->status = GST_PARSER_ERROR)

/* Return a pointer to the state on top of the parse stack without removing
 * it. If the stack is empty, records a "parser stack underflow" error on the
 * parser and returns NULL. */
static GstParseState *parser_peek(GstParser *p) {
    if (p->count != 0) {
        return &p->data[p->count - 1];
    }
    p_error(p, "parser stack underflow");
    return NULL;
}

/* Pop the top state off the parse stack and return a pointer to it. The
 * slot itself is not cleared, only the count is decremented. If the stack is
 * empty, records a "parser stack underflow" error and returns NULL. */
static GstParseState *parser_pop(GstParser * p) {
    if (p->count != 0) {
        p->count -= 1;
        return &p->data[p->count];
    }
    p_error(p, "parser stack underflow");
    return NULL;
}

/* Wrap x in a quote form: returns the two element tuple (quote x), where
 * the first element is the string "quote" built with gst_string_cv. */
static GstValue quote(GstParser *p, GstValue x) {
    GstValue quoted;
    GstValue *items = gst_tuple_begin(p->vm, 2);
    items[0] = gst_string_cv(p->vm, "quote");
    items[1] = x;
    quoted.type = GST_TUPLE;
    quoted.data.tuple = gst_tuple_end(p->vm, items);
    return quoted;
}

/* Add a new, empty ParseState to the ParseStack.
 *
 * p         - the parser whose stack is extended.
 * type      - the kind of state to push.
 * character - the byte that triggered the push; for PTYPE_FORM it selects
 *             the matching closing delimiter.
 *
 * The quote count accumulated on the parser is saved on the state that was
 * on top before the push (so it is applied when the new state's value is
 * later appended there) and the parser's count is reset to zero. */
static void parser_push(GstParser *p, ParseType type, uint8_t character) {
    GstParseState *top;
    if (p->count >= p->cap) {
        /* Grow the stack. The size passed to gst_alloc is in bytes (see
         * gst_parser), so it must be scaled by the element size, and the
         * existing states must be carried over to the new block. The "+ 2"
         * keeps growth working even from an empty stack. */
        uint32_t newCap = 2 * p->count + 2;
        GstParseState *data = gst_alloc(p->vm, newCap * sizeof(GstParseState));
        if (p->count) {
            gst_memcpy(data, p->data, p->count * sizeof(GstParseState));
        }
        p->data = data;
        p->cap = newCap;
    }
    if (p->count) {
        /* Hand pending quotes over to the current top state. */
        top = parser_peek(p);
        top->quoteCount = p->quoteCount;
        p->quoteCount = 0;
    }
    ++p->count;
    top = parser_peek(p);
    if (!top) return;
    top->type = type;
    /* A fresh state has no quotes pending for its children yet. */
    top->quoteCount = 0;
    switch (type) {
        case PTYPE_ROOT:
            break;
        case PTYPE_STRING:
            top->buf.string.state = STRING_STATE_BASE;
            /* fallthrough: strings and tokens both collect into a buffer */
        case PTYPE_TOKEN:
            top->buf.string.buffer = gst_buffer(p->vm, 10);
            break;
        case PTYPE_FORM:
            top->buf.form.array = gst_array(p->vm, 10);
            if (character == '(') top->buf.form.endDelimiter = ')';
            if (character == '[') top->buf.form.endDelimiter = ']';
            if (character == '{') top->buf.form.endDelimiter = '}';
            break;
    }
}

/* Append a value to the top-most state in the Parser's stack.
 *
 * The value is first wrapped in one (quote ...) form per pending quote
 * recorded on the top state. A ROOT state then stores the value in p->value
 * and marks the parser GST_PARSER_FULL; a FORM state pushes it onto the
 * form's array. Any other state type is reported as an error. Does nothing
 * if the stack is empty (parser_peek has already recorded the error). */
static void parser_append(GstParser *p, GstValue x) {
    GstParseState *top = parser_peek(p);
    if (!top) return;
    /* Consume the pending quotes and stop exactly at zero. A post-decrement
     * in the loop condition would leave the unsigned counter wrapped around
     * to its maximum value once the loop exits. */
    while (top->quoteCount > 0) {
        --top->quoteCount;
        x = quote(p, x);
    }
    switch (top->type) {
        case PTYPE_ROOT:
            p->value = x;
            p->status = GST_PARSER_FULL;
            break;
        case PTYPE_FORM:
            gst_array_push(p->vm, top->buf.form.array, x);
            break;
        default:
            p_error(p, "expected container type");
            break;
    }
}

/* Return 1 if c separates tokens, 0 otherwise. Space, tab, newline,
 * carriage return, the NUL byte and the comma all count as whitespace. */
static int is_whitespace(uint8_t c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\0':
        case ',':
            return 1;
        default:
            return 0;
    }
}

/* Return 1 if c may appear in a token, 0 otherwise. Accepted bytes are the
 * ASCII letters, the ranges '0'..':', '<'..'@', '*'..'/' and '#'..'&', and
 * the single characters '_', '^' and '!'. */
static int is_symbol_char(uint8_t c) {
    switch (c) {
        case '_':
        case '^':
        case '!':
            return 1;
        default:
            break;
    }
    return (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= ':')
        || (c >= '<' && c <= '@')
        || (c >= '*' && c <= '/')
        || (c >= '#' && c <= '&');
}

/* Compute 10 raised to an integer power as a double.
 * For positive powers, 10 is squared repeatedly to reach the largest power
 * of two not exceeding the request, and the remainder is handled by a
 * recursive call. Negative powers are the reciprocal of the positive one. */
static double exp10(int power) {
    double base;
    int covered;
    if (power < 0) {
        return 1 / exp10(-power);
    }
    if (power == 0) {
        return 1;
    }
    base = 10;
    for (covered = 1; covered * 2 <= power; covered *= 2) {
        base = base * base;
    }
    return base * exp10(power - covered);
}

/* Read a real number from the byte range [string, end).
 *
 * string, end - bounds of the text; the whole range must be consumed.
 * ret         - receives the parsed value when the function returns 1.
 * forceInt    - when non-zero, only an optionally signed run of digits is
 *               accepted (used for the exponent part).
 *
 * Accepted syntax: optional '+'/'-', digits with at most one '.', and an
 * optional 'e'/'E' followed by a signed integer exponent. At least one
 * digit is required in the mantissa, so tokens such as "." or "e5" are
 * rejected instead of being read as 0.
 *
 * Returns 1 on success and 0 if the range is not a valid number. */
static int read_real(const uint8_t *string, const uint8_t *end, double *ret, int forceInt) {
    int sign = 1, x = 0;
    int seenDigit = 0, seenPoint = 0;
    double accum = 0, scale = 1, place = 1;
    /* Never look at *string for an empty range */
    if (string >= end) return 0;
    /* Check the sign */
    if (*string == '-') {
        sign = -1;
        ++string;
    } else if (*string == '+') {
        ++string;
    }
    if (string >= end) return 0;
    while (string < end) {
        if (*string == '.' && !forceInt) {
            /* Only one decimal point is allowed */
            if (seenPoint) return 0;
            seenPoint = 1;
            place = 0.1;
        } else if (!forceInt && (*string == 'e' || *string == 'E')) {
            double exponent = 0;
            /* An exponent needs a mantissa in front of it */
            if (!seenDigit) return 0;
            /* Read the exponent */
            ++string;
            if (string >= end) return 0;
            if (!read_real(string, end, &exponent, 1))
                return 0;
            /* Keep the exponent inside int range before converting. Powers
             * beyond +/-1024 already saturate to infinity (or zero) in
             * exp10, so clamping does not change the result. */
            if (exponent > 1024) exponent = 1024;
            else if (exponent < -1024) exponent = -1024;
            scale = exp10((int) exponent);
            break;
        } else {
            x = *string;
            if (x < '0' || x > '9') return 0;
            x -= '0';
            seenDigit = 1;
            if (place < 1) {
                /* Fractional digit */
                accum += x * place;
                place *= 0.1;
            } else {
                /* Integral digit */
                accum *= 10;
                accum += x;
            }
        }
        ++string;
    }
    if (!seenDigit) return 0;
    *ret = accum * sign * scale;
    return 1;
}

/* Read a decimal integer from the byte range [string, end).
 *
 * string, end - bounds of the text; the whole range must be consumed.
 * ret         - receives the parsed value when the function returns 1.
 *
 * Accepts an optional '+' or '-' followed by one or more digits. Returns 1
 * on success. Returns 0 if the range is empty, contains a non-digit, or the
 * value does not fit in an int64_t (the caller can then fall back to
 * reading the token as a real). */
static int read_integer(const uint8_t *string, const uint8_t *end, int64_t *ret) {
    /* Magnitude of the most negative int64_t, i.e. 2^63 */
    const uint64_t minMagnitude = (uint64_t) 1 << 63;
    int negative = 0;
    uint64_t limit;
    uint64_t accum = 0;
    /* Never look at *string for an empty range */
    if (string >= end) return 0;
    if (*string == '-') {
        negative = 1;
        ++string;
    } else if (*string == '+') {
        ++string;
    }
    if (string >= end) return 0;
    /* Largest magnitude representable for the chosen sign */
    limit = negative ? minMagnitude : minMagnitude - 1;
    while (string < end) {
        uint64_t digit;
        if (*string < '0' || *string > '9') return 0;
        digit = (uint64_t) (*string - '0');
        /* Reject before accum * 10 + digit would exceed the limit */
        if (accum > (limit - digit) / 10) return 0;
        accum = accum * 10 + digit;
        ++string;
    }
    if (negative) {
        /* Negate without overflowing when accum is exactly 2^63 */
        *ret = accum ? -(int64_t) (accum - 1) - 1 : 0;
    } else {
        *ret = (int64_t) accum;
    }
    return 1;
}

/* Compare the byte range [start, end) against the NUL terminated string
 * ref. Returns 1 only when both have the same length and the same bytes,
 * otherwise 0. */
static int check_str_const(const char *ref, const uint8_t *start, const uint8_t *end) {
    const uint8_t *cursor = start;
    for (; *ref != '\0' && cursor < end; ++ref, ++cursor) {
        if ((uint8_t) *ref != *cursor) {
            return 0;
        }
    }
    /* Both inputs must be exhausted at the same time */
    return *ref == '\0' && cursor == end;
}

/* Turn the bytes collected for a token into a value.
 * The token is tried, in order, as an integer, a real, and the constants
 * nil, false and true. Anything else becomes a string value (a symbol),
 * unless it starts with a digit, in which case an error is recorded on the
 * parser and a nil-typed value is returned. */
static GstValue build_token(GstParser *p, GstBuffer *buf) {
    GstValue result;
    GstReal realValue;
    GstInteger intValue;
    uint8_t *first = buf->data;
    uint8_t *limit = first + buf->count;

    if (read_integer(first, limit, &intValue)) {
        result.type = GST_INTEGER;
        result.data.integer = intValue;
        return result;
    }
    if (read_real(first, limit, &realValue, 0)) {
        result.type = GST_REAL;
        result.data.real = realValue;
        return result;
    }
    if (check_str_const("nil", first, limit)) {
        result.type = GST_NIL;
        result.data.boolean = 0;
        return result;
    }
    if (check_str_const("false", first, limit)) {
        result.type = GST_BOOLEAN;
        result.data.boolean = 0;
        return result;
    }
    if (check_str_const("true", first, limit)) {
        result.type = GST_BOOLEAN;
        result.data.boolean = 1;
        return result;
    }
    /* Not a literal: it is a symbol, which may not begin with a digit */
    if (first[0] >= '0' && first[0] <= '9') {
        p_error(p, "symbols cannot start with digits");
        result.type = GST_NIL;
        return result;
    }
    result.type = GST_STRING;
    result.data.string = gst_buffer_to_string(p->vm, buf);
    return result;
}

/* Feed one byte to a TOKEN state.
 * Whitespace or a closing delimiter ends the token: the state is popped and
 * the built value is appended to the state below. Returns 1 when the byte
 * has been consumed and 0 when it must be dispatched again (a closing
 * delimiter still has to be seen by the enclosing state). */
static int token_state(GstParser *p, uint8_t c) {
    GstParseState *state = parser_peek(p);
    GstBuffer *text = state->buf.string.buffer;
    int closes = (c == ')' || c == ']' || c == '}');

    if (closes || is_whitespace(c)) {
        parser_pop(p);
        parser_append(p, build_token(p, text));
        return !closes;
    }
    if (!is_symbol_char(c)) {
        p_error(p, "expected symbol character");
        return 1;
    }
    gst_buffer_push(p->vm, text, c);
    return 1;
}

/* Convert a hexadecimal digit character to its value (0-15). Both upper
 * and lower case letters are accepted. Returns -1 for any other byte. */
static int to_hex(uint8_t c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

/* Handle parsing a string literal.
 *
 * Feeds one byte to the STRING state on top of the stack and always
 * returns 1 (the byte is consumed). In the base state a backslash starts an
 * escape, a double quote finishes the literal (the state is popped and the
 * quoted string value is appended to the state below) and any other byte is
 * stored as is. Supported escapes are \n \r \t \f \0 \" \' \z and \hXX,
 * where XX is exactly two hexadecimal digits giving the byte value.
 * NOTE(review): there is no escape for a literal backslash; \h5C is the
 * only way to write one. */
static int string_state(GstParser *p, uint8_t c) {
    int digit;
    GstParseState *top = parser_peek(p);
    switch (top->buf.string.state) {
        case STRING_STATE_BASE:
            if (c == '\\') {
                top->buf.string.state = STRING_STATE_ESCAPE;
            } else if (c == '"') {
                GstValue x;
                x.type = GST_STRING;
                x.data.string = gst_buffer_to_string(p->vm, top->buf.string.buffer);
                parser_pop(p);
                /* String values are quoted when appended */
                parser_append(p, quote(p, x));
            } else {
                gst_buffer_push(p->vm, top->buf.string.buffer, c);
            }
            break;
        case STRING_STATE_ESCAPE:
            {
                uint8_t next;
                switch (c) {
                    case 'n': next = '\n'; break;
                    case 'r': next = '\r'; break;
                    case 't': next = '\t'; break;
                    case 'f': next = '\f'; break;
                    case '0': next = '\0'; break;
                    case '"': next = '"'; break;
                    case '\'': next = '\''; break;
                    case 'z': next = '\0'; break;
                    case 'h':
                        /* Start collecting the two hex digits */
                        top->buf.string.state = STRING_STATE_ESCAPE_HEX;
                        top->buf.string.count = 0;
                        top->buf.string.accum = 0;
                        return 1;
                    default:
                        p_error(p, "unknown string escape sequence");
                        return 1;
                }
                gst_buffer_push(p->vm, top->buf.string.buffer, next);
                top->buf.string.state = STRING_STATE_BASE;
            }
            break;
        case STRING_STATE_ESCAPE_HEX:
            digit = to_hex(c);
            if (digit < 0) {
                p_error(p, "invalid hexidecimal digit");
                return 1;
            }
            /* Shift in one hex digit. The digit must be added exactly once,
             * otherwise \h41 would not produce the byte 0x41. */
            top->buf.string.accum = top->buf.string.accum * 16 + (uint32_t) digit;
            if (++top->buf.string.count == 2) {
                gst_buffer_push(p->vm, top->buf.string.buffer, (uint8_t) top->buf.string.accum);
                top->buf.string.state = STRING_STATE_BASE;
            }
            break;
        case STRING_STATE_ESCAPE_UNICODE:
            /* Not implemented; nothing in this file enters this state */
            break;
    }
    return 1;
}

/* Handle a byte where a new value may begin (also used by form_state).
 * Whitespace is skipped. Any other byte marks the parser as pending and
 * then either opens a form, opens a string, counts a quote, or starts a
 * token. Returns 1 when the byte has been consumed and 0 when it must be
 * dispatched again (the first byte of a token is re-read by token_state). */
static int root_state(GstParser *p, uint8_t c) {
    if (is_whitespace(c)) {
        return 1;
    }
    p->status = GST_PARSER_PENDING;
    switch (c) {
        case ']':
        case ')':
        case '}':
            p_error(p, UNEXPECTED_CLOSING_DELIM);
            return 1;
        case '(':
        case '[':
        case '{':
            parser_push(p, PTYPE_FORM, c);
            return 1;
        case '"':
            parser_push(p, PTYPE_STRING, c);
            return 1;
        case '\'':
            p->quoteCount++;
            return 1;
        default:
            break;
    }
    if (!is_symbol_char(c)) {
        p_error(p, "unexpected character");
        return 1;
    }
    parser_push(p, PTYPE_TOKEN, c);
    return 0;
}

/* Feed one byte to a FORM state.
 * If the byte is the form's closing delimiter, the collected values are
 * turned into an array (']'), a tuple (')') or a table ('}'), the state is
 * popped and the result is appended to the state below. A table needs an
 * even number of elements (key/value pairs); otherwise an error is recorded
 * and the state is left on the stack. Any other byte is handled exactly as
 * in the root state. Returns 1 when the byte has been consumed. */
static int form_state(GstParser *p, uint8_t c) {
    GstParseState *frame = parser_peek(p);
    GstArray *items;
    GstValue result;

    if (c != frame->buf.form.endDelimiter) {
        return root_state(p, c);
    }
    items = frame->buf.form.array;
    switch (c) {
        case ']':
            result.type = GST_ARRAY;
            result.data.array = items;
            break;
        case ')':
            {
                GstValue *slots = gst_tuple_begin(p->vm, items->count);
                gst_memcpy(slots, items->data, items->count * sizeof(GstValue));
                result.type = GST_TUPLE;
                result.data.tuple = gst_tuple_end(p->vm, slots);
            }
            break;
        default: /* c == '}' */
            {
                uint32_t k;
                if (items->count % 2 != 0) {
                    p_error(p, "table literal must have even number of elements");
                    return 1;
                }
                result.type = GST_TABLE;
                result.data.table = gst_table(p->vm, items->count);
                for (k = 0; k < items->count; k += 2) {
                    gst_table_put(p->vm, result.data.table, items->data[k], items->data[k + 1]);
                }
            }
            break;
    }
    parser_pop(p);
    parser_append(p, result);
    return 1;
}

/* Handle a character.
 *
 * Advances p->index, filters comment bytes according to p->flags, and then
 * hands the byte to the handler of the state on top of the parse stack. A
 * handler returns 0 when the byte must be seen again by the new top state
 * (for example a ')' that ended a token still has to close the form), so
 * dispatching repeats until a handler consumes the byte.
 *
 * Dispatching stops as soon as the parser is in the error state. Otherwise
 * a later handler could overwrite the error status: in "(1a)" the bad token
 * is reported, and the ')' would then complete the form and mark the parser
 * as holding a value. */
static void dispatch_char(GstParser *p, uint8_t c) {
    int done = 0;
    ++p->index;
    /* Handle comments */
    if (p->flags & GST_PARSER_FLAG_INCOMMENT) {
        /* Inside a comment: skip everything up to and including newline */
        if (c == '\n') {
            p->flags = GST_PARSER_FLAG_EXPECTING_COMMENT;
        }
        return;
    } else if (p->flags & GST_PARSER_FLAG_EXPECTING_COMMENT) {
        if (c == '#') {
            p->flags = GST_PARSER_FLAG_INCOMMENT;
            return;
        } else if (!is_whitespace(c)) {
            /* First real byte: stop looking for comments */
            p->flags = 0;
        } else {
            return;
        }
    }
    /* Dispatch character to state */
    while (!done) {
        GstParseState *top = parser_peek(p);
        /* parser_peek has already recorded the underflow error */
        if (!top) return;
        switch (top->type) {
            case PTYPE_ROOT:
                done = root_state(p, c);
                break;
            case PTYPE_TOKEN:
                done = token_state(p, c);
                break;
            case PTYPE_FORM:
                done = form_state(p, c);
                break;
            case PTYPE_STRING:
                done = string_state(p, c);
                break;
            default:
                /* Corrupt state: report it rather than spin forever */
                p_error(p, "unknown parse state");
                return;
        }
        /* Keep the first error; do not let outer states process the byte */
        if (p->status == GST_PARSER_ERROR) return;
    }
}

/* Parse a NUL terminated C string.
 * Bytes are fed to the parser one at a time while it is in the ROOT or
 * PENDING status and the terminator has not been reached, so parsing stops
 * after the first complete value or the first error. A completed value is
 * left in p->value. Returns the number of bytes consumed; the caller can
 * add it to string to find the first unread byte. */
int gst_parse_cstring(GstParser *p, const char *string) {
    const char *cursor = string;
    for (;;) {
        if (p->status != GST_PARSER_PENDING && p->status != GST_PARSER_ROOT) {
            break;
        }
        if (*cursor == '\0') {
            break;
        }
        dispatch_char(p, *cursor++);
    }
    return (int) (cursor - string);
}

/* Parse a gst string, whose length is given by gst_string_length.
 * Bytes are fed to the parser while it is in the ROOT or PENDING status.
 * Returns the number of bytes consumed. */
int gst_parse_string(GstParser *p, const uint8_t *string) {
    uint32_t pos = 0;
    while (pos < gst_string_length(string)
            && (p->status == GST_PARSER_PENDING || p->status == GST_PARSER_ROOT)) {
        dispatch_char(p, string[pos]);
        ++pos;
    }
    return pos;
}

/* Return 1 if the parser holds a completed value (status GST_PARSER_FULL),
 * 0 otherwise. The parse functions in this file feed no further input to
 * the parser while it is in that status, i.e. until gst_parse_consume is
 * called. */
int gst_parse_hasvalue(GstParser *p) {
    if (p->status == GST_PARSER_FULL) {
        return 1;
    }
    return 0;
}

/* Return the value stored in the parser and put the parser back into the
 * ROOT status so that it accepts input again. The function does not check
 * that a value is actually pending; use gst_parse_hasvalue first. */
GstValue gst_parse_consume(GstParser *p) {
    GstValue pending = p->value;
    p->status = GST_PARSER_ROOT;
    return pending;
}

/* Initialize a parser for the given vm.
 * Allocates room for 10 parse states, resets all bookkeeping fields, sets
 * the flags so that a leading comment is recognised, and pushes the ROOT
 * state that receives completed values. */
void gst_parser(GstParser *p, Gst *vm) {
    p->vm = vm;
    p->data = gst_alloc(vm, sizeof(GstParseState) * 10);
    p->cap = 10;
    p->count = 0;
    /* Input position and pending quote counter */
    p->index = 0;
    p->quoteCount = 0;
    /* No error, no value yet */
    p->error = NULL;
    p->value.type = GST_NIL;
    p->status = GST_PARSER_ROOT;
    p->flags = GST_PARSER_FLAG_EXPECTING_COMMENT;
    parser_push(p, PTYPE_ROOT, ' ');
}

/* GC mark a parser stored as userdata.
 * Does nothing unless len matches sizeof(GstParser). Otherwise marks the
 * parse stack memory, the stored value, and the array or buffer held by
 * every state currently on the stack. */
static void gst_stl_parser_mark(Gst *vm, void *data, uint32_t len) {
    uint32_t slot;
    GstParser *parser = (GstParser *) data;
    if (len != sizeof(GstParser)) {
        return;
    }
    gst_mark_mem(vm, parser->data);
    gst_mark_value(vm, parser->value);
    for (slot = 0; slot < parser->count; ++slot) {
        GstParseState *state = &parser->data[slot];
        if (state->type == PTYPE_FORM) {
            gst_mark_value(vm, gst_wrap_array(state->buf.form.array));
        } else if (state->type == PTYPE_STRING || state->type == PTYPE_TOKEN) {
            /* Strings and tokens share the buffer member */
            gst_mark_value(vm, gst_wrap_buffer(state->buf.string.buffer));
        }
        /* ROOT states hold nothing to mark */
    }
}

/***/
/* Stl functions */
/***/

/* User type descriptor for parser userdata. It is passed to gst_userdata
 * when a parser is created and to gst_check_userdata when one is expected
 * as an argument. Only the name and the GC mark function are provided.
 * NOTE(review): the three NULL slots are presumably optional hooks of
 * GstUserType -- confirm against its declaration. */
static const GstUserType gst_stl_parsetype = {
	"std.parser",
	NULL,
	NULL,
	NULL,
	&gst_stl_parser_mark
};

/* Create a parser: allocates parser userdata of type gst_stl_parsetype,
 * initializes it with gst_parser and returns it wrapped as a value. */
int gst_stl_parser(Gst *vm) {
    GstParser *parser = gst_userdata(vm, sizeof(GstParser), &gst_stl_parsetype);
    gst_parser(parser, vm);
    gst_c_return(vm, gst_wrap_userdata(parser));
}

/* Consume a value from the parser given as argument 0.
 * Throws "expected parser" if the argument is not a parser and
 * "parser has no pending value" if no value is ready; otherwise returns
 * the value and lets the parser accept input again. */
int gst_stl_parser_consume(Gst *vm) {
    GstParser *parser = gst_check_userdata(vm, 0, &gst_stl_parsetype);
    if (parser == NULL) {
        gst_c_throwc(vm, "expected parser");
    }
    if (!gst_parse_hasvalue(parser)) {
        gst_c_throwc(vm, "parser has no pending value");
    }
    gst_c_return(vm, gst_parse_consume(parser));
}

/* Check if the parser given as argument 0 has a value to consume.
 * Throws "expected parser" if the argument is not a parser; otherwise
 * returns the result of gst_parse_hasvalue as a boolean. */
int gst_stl_parser_hasvalue(Gst *vm) {
    GstParser *parser = gst_check_userdata(vm, 0, &gst_stl_parsetype);
    if (parser == NULL) {
        gst_c_throwc(vm, "expected parser");
    }
    gst_c_return(vm, gst_wrap_boolean(gst_parse_hasvalue(parser)));
}

/* Parse a single byte.
 * Argument 0 is the parser and argument 1 the byte as an integer; a wrong
 * argument throws "expected parser" or "expected integer". The byte is fed
 * to the parser only while it is in the ROOT or PENDING status. Returns
 * true if the byte was fed to the parser and false if it was not. */
int gst_stl_parser_byte(Gst *vm) {
    GstInteger byte;
    int accepting;
    GstParser *parser = gst_check_userdata(vm, 0, &gst_stl_parsetype);
    if (parser == NULL) {
        gst_c_throwc(vm, "expected parser");
    }
    if (!gst_check_integer(vm, 1, &byte)) {
        gst_c_throwc(vm, "expected integer");
    }
    /* Decide before dispatching: the status may change while parsing */
    accepting = parser->status == GST_PARSER_PENDING || parser->status == GST_PARSER_ROOT;
    if (accepting) {
        dispatch_char(parser, byte);
    }
    gst_c_return(vm, gst_wrap_boolean(accepting));
}

/* Parse a string or buffer.
 * Argument 0 is the parser and argument 1 the character data; a wrong
 * argument throws "expected parser" or "expected string/buffer". Bytes are
 * fed to the parser while it is in the ROOT or PENDING status. Returns nil
 * if every byte was consumed, otherwise a new string holding the bytes that
 * were not consumed. */
int gst_stl_parser_charseq(Gst *vm) {
    uint32_t used = 0;
    uint32_t len;
    const uint8_t *data;
    GstParser *parser = gst_check_userdata(vm, 0, &gst_stl_parsetype);
    if (parser == NULL) {
        gst_c_throwc(vm, "expected parser");
    }
    if (!gst_chararray_view(gst_arg(vm, 1), &data, &len)) {
        gst_c_throwc(vm, "expected string/buffer");
    }
    while (used < len
            && (parser->status == GST_PARSER_PENDING || parser->status == GST_PARSER_ROOT)) {
        dispatch_char(parser, data[used]);
        ++used;
    }
    if (used < len) {
        /* We have remaining characters */
        gst_c_return(vm, gst_wrap_string(gst_string_b(vm, data + used, len - used)));
    }
    /* No remainder */
    gst_c_return(vm, gst_wrap_nil());
}

/* The module: maps each exported name to the C function implementing it.
 * The list ends with a {NULL, NULL} entry (presumably the terminator
 * expected by gst_cmodule_struct -- confirm against its definition). */
static const GstModuleItem gst_parser_module[] = {
	{"parser", gst_stl_parser},
	{"parse-byte", gst_stl_parser_byte},
	{"parse-consume", gst_stl_parser_consume},
	{"parse-hasvalue", gst_stl_parser_hasvalue},
	{"parse-charseq", gst_stl_parser_charseq},
	{NULL, NULL}
};

/* Load the module: builds a module value from gst_parser_module with
 * gst_cmodule_struct and stores it in the vm under the name "std.parse". */
void gst_parse_load(Gst *vm) {
	gst_module_put(vm, "std.parse", gst_cmodule_struct(vm, gst_parser_module));
}