From efca2f5483256674c33110c0b4541a829b7ee2b2 Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 1 Apr 2022 22:12:28 +0800 Subject: [PATCH] add tokenizer --- .gitignore | 2 + Makefile | 7 +- token.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++++++ token.h | 30 +++++++ 4 files changed, 292 insertions(+), 3 deletions(-) create mode 100644 .gitignore create mode 100644 token.c create mode 100644 token.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a5e2663 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.o +woody diff --git a/Makefile b/Makefile index 9d1fc1e..95fcc79 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ .POSIX: -CC = pcc -CFLAGS = -std=c99 -OBJ = main.o +CC = tcc +CFLAGS = -Wall -Wunsupported -Wwrite-strings +OBJ = main.o token.o all: woody @@ -10,6 +10,7 @@ woody: $(OBJ) $(CC) $(CFLAGS) -o woody $(OBJ) main.o: main.c token.o +token.o: token.c token.h clean: rm -f woody $(OBJ) diff --git a/token.c b/token.c new file mode 100644 index 0000000..8250bed --- /dev/null +++ b/token.c @@ -0,0 +1,256 @@ +/** + * Plans: + * - maybe change exit(EXIT_FAILURE) to some kind of woody_exit? + */ +#include +#include +#include +#include +#include +#include "token.h" + +static const char *keywords[] = { + "if", "then", "elsif", "else", "while" +}; + +/** + * Gets the next character in the file, and increments + * the position/column/line in the struct tokenstate. + */ +static inline void +getch(struct tokenstate *ts) +{ + int look = getc(ts->fh); + + if (look == EOF) + ts->look = '\0'; + else { + if (look == '\n') { + ts->col = 0; + ts->line++; + } + ts->look = look; + ts->col++; + ts->pos++; + } +} + +/** + * Returns a pointer to where the next token should be put. + * In struct tokenstate, the field tokens is an array of + * pointers to ``blocks'' of 1024 struct tokens. This function + * checks whether tokenind is over the limit, and if so, + * it just returns tokens[tokenblk][tokenind++]. Otherwise, it + * allocates a new block. + * + * Note that I made this method up, it might have some other + * name already. If so, please contact me so I can change the + * comments. + */ +static inline struct token * +getnexttoken(struct tokenstate *ts) +{ + /* enough space in this block */ + if (ts->tokenind < 1024) + return &ts->tokens[ts->tokenblk][ts->tokenind++]; + /* not enough space in this block */ + else { + if (++ts->tokenblk < ts->tokensz) { +alloc_new_block: + /* allocate a new block */ + ts->tokens[ts->tokenblk] = calloc(1024, + sizeof(struct token)); + if (ts->tokens[ts->tokenblk] == NULL) { + perror("woody lexer"); + exit(EXIT_FAILURE); + } + return &ts->tokens[ts->tokenblk][ts->tokenind++]; + + } + /* all pointers in array exhausted, must reallocate */ + else { + ts->tokensz++; /* need larger tokensz */ + ts->tokenblk++; /* go to next (new) block */ + ts->tokens = realloc(ts->tokens, + ts->tokensz * sizeof(struct token)); + if (ts->tokens == NULL) { + perror("woody lexer"); + exit(EXIT_FAILURE); + } + + ts->tokenind = 0; + + goto alloc_new_block; + } + } + /* NOTREACHED */ +} + +/** + * Used to get a character like getch but in the context + * of a string. Thus also accepting escape sequences. + * \n newline + * \r carriage return + */ +static inline int +getcharfromstr(struct tokenstate *ts) +{ + if (ts->look == '\\') { + getch(ts); + switch (ts->look) { + case 'n': + getch(ts); + return '\n'; + case 'r': + getch(ts); + return '\r'; + default: + fprintf(stderr, "%s:%i:%i: " + "invalid escape sequence\n", + ts->filename, ts->line, ts->col); + exit(EXIT_FAILURE); + /* NOTREACHED */ + return -1; + } + } + else { + int look = ts->look; + getch(ts); + return look; + } +} + + +/** + * This function loops through the given file and tokenizes it. + * It works as follows: + * Step 1: Get character + * Step 2: Check if character is a string, number, operator + * or a keyword/variable + * Step 3: Do the appropriate action for parsing + * Step 4: Repeat until EOF + */ +int +tokenize(struct tokenstate *ts) +{ + struct token *token; + + if (ts == NULL) + return -1; + + getch(ts); + + while (ts->look != 0) { + /** + * Get a pointer to the next struct token and set + * information used across all cases. + */ + token = getnexttoken(ts); + token->filename = ts->filename; + token->col = ts->col; + token->line = ts->line; + token->pos = ts->pos; + + if (isdigit(ts->look)) { + /* number */ + int64_t num = ts->look - '0'; + + getch(ts); + while (isdigit(ts->look)) { + num *= 10; + num += ts->look - '0'; + getch(ts); + } + + token->type = NUM; + token->val.num = num; + } + else if (isalpha(ts->look)) { + /* variable or keyword */ + /* NOTE: maximum var size is 32, maybe change? */ + int i; + char *buf = malloc(32); + if (buf == NULL) { + perror("woody lexer"); + exit(EXIT_FAILURE); + } + + /* read the var/kw */ + for (int i = 0; i < 32 && isalpha(ts->look) + && isdigit(ts->look); i++) { + buf[i] = ts->look; + getch(ts); + } + buf[i] = '\0'; + + for (i = 0; i < ENDKWTYPE; i++) { + if (strcmp(buf, keywords[i]) == 0) { + /* keyword */ + token->type = KW; + token->val.kw = i; + } + } + + /* variable */ + if (i == ENDKWTYPE) { + token->type = VAR; + token->val.var = buf; + } + + } + else if (ts->look == '"') { + /* string */ + int i; + size_t bufsz = 512; + char *buf; + + getch(ts); + if (ts->look == '"') { + /* empty string is illegal */ + fprintf(stderr, + "%s:%i:%i: empty string" + " is illegal\n", + ts->filename, ts->line, + ts->col); + exit(EXIT_FAILURE); + } + + buf = malloc(bufsz); + if (buf == NULL) { + perror("woody lexer"); + exit(EXIT_FAILURE); + } + +read_string: + for (i = 0; i < bufsz && ts->look != '"' + && ts->look != '\0'; i++) + buf[i] = getcharfromstr(ts); + + if (ts->look != '"' && ts->look != '\0') { + /* buf too small */ + buf = realloc(buf, bufsz + 127); + if (buf == NULL) { + perror("woody lexer"); + exit(EXIT_FAILURE); + } + + goto read_string; + } + + buf[i] = '\0'; + + token->type = STRING; + token->val.string = buf; + + } + else { + /* something else, probably an operator */ + /* don't bother to check here, it's not worth it */ + token->type = OP; + token->val.op = ts->look; + getch(ts); + } + } + + return 0; +} diff --git a/token.h b/token.h new file mode 100644 index 0000000..1ebe3d7 --- /dev/null +++ b/token.h @@ -0,0 +1,30 @@ +/* please also see token.c */ +enum tokentype { KW, STRING, VAR, NUM, OP }; +enum kwtype { IF, THEN, ELSIF, ELSE, WHILE, ENDKWTYPE }; + +struct token { + union { + const char *string, *var; + int64_t num; + char op; + enum kwtype kw; + } val; + const char *filename; + int col, line, pos; + enum tokentype type; +}; + +struct tokenstate { + /* + * tokens + * | + * |_> | ptr to 1024 struct token | ... | + */ + struct token **tokens; + const char *filename; + FILE *fh; + size_t tokensz, tokenblk, tokenind; /* see get_next_token */ + int col, line, pos, look; +}; + +extern int tokenize(struct tokenstate *);