add tokenizer

This commit is contained in:
matt 2022-04-01 22:12:28 +08:00
parent fbf7a1e7d3
commit efca2f5483
4 changed files with 292 additions and 3 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.o
woody

View File

@ -1,8 +1,8 @@
.POSIX:
CC = pcc
CFLAGS = -std=c99
OBJ = main.o
CC = tcc
CFLAGS = -Wall -Wunsupported -Wwrite-strings
OBJ = main.o token.o
all: woody
@ -10,6 +10,7 @@ woody: $(OBJ)
$(CC) $(CFLAGS) -o woody $(OBJ)
main.o: main.c token.o
token.o: token.c token.h
clean:
rm -f woody $(OBJ)

256
token.c Normal file
View File

@ -0,0 +1,256 @@
/**
* Plans:
* - maybe change exit(EXIT_FAILURE) to some kind of woody_exit?
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "token.h"
/*
 * Keyword spellings. The index of each entry MUST match its
 * enum kwtype value in token.h (IF..WHILE); the keyword scan in
 * tokenize() stores the array index directly as token->val.kw.
 */
static const char *keywords[] = {
	"if", "then", "elsif", "else", "while"
};
/**
 * Advance the lexer by one character: read the next character
 * from ts->fh into ts->look and keep pos/col/line up to date.
 * EOF is mapped to '\0' so callers can test a single sentinel.
 */
static inline void
getch(struct tokenstate *ts)
{
	int c = getc(ts->fh);

	if (c == EOF) {
		ts->look = '\0';
		return;
	}
	if (c == '\n') {
		/* the newline itself counts as column 1 of the next line */
		ts->col = 0;
		ts->line++;
	}
	ts->look = c;
	ts->col++;
	ts->pos++;
}
/**
 * Return a pointer to the slot where the next token should be put.
 *
 * ts->tokens is an array (of length ts->tokensz) of pointers to
 * ``blocks'' of 1024 struct tokens; ts->tokenblk is the block in
 * use and ts->tokenind the next free slot in it.  When the current
 * block is full we move to the next block, growing the pointer
 * array first if it is exhausted, and allocate the new block.
 *
 * Never returns NULL: allocation failure prints an error and exits.
 */
static inline struct token *
getnexttoken(struct tokenstate *ts)
{
	/* fast path: room left in the current block */
	if (ts->tokenind < 1024)
		return &ts->tokens[ts->tokenblk][ts->tokenind++];

	/* current block full: advance to the next block slot */
	ts->tokenblk++;
	ts->tokenind = 0;	/* BUG FIX: was left at 1024, indexing
				 * the fresh block out of bounds */

	if (ts->tokenblk >= ts->tokensz) {
		/* pointer array exhausted, grow it by one slot */
		struct token **tmp;

		ts->tokensz++;
		/* BUG FIX: element size is a *pointer*, not a token;
		 * also keep the old pointer on realloc failure */
		tmp = realloc(ts->tokens,
		    ts->tokensz * sizeof(struct token *));
		if (tmp == NULL) {
			perror("woody lexer");
			exit(EXIT_FAILURE);
		}
		ts->tokens = tmp;
	}

	/* allocate the new 1024-token block */
	ts->tokens[ts->tokenblk] = calloc(1024, sizeof(struct token));
	if (ts->tokens[ts->tokenblk] == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}
	return &ts->tokens[ts->tokenblk][ts->tokenind++];
}
/**
 * Like getch but in the context of a string literal: decode one
 * (possibly escaped) character, advance past it, and return it.
 * Recognized escape sequences:
 *   \n  newline
 *   \r  carriage return
 *   \\  backslash
 *   \"  double quote
 * (\\ and \" are new: without them there was no way to put a
 * backslash or a quote inside a string at all.)
 * Any other escape is a fatal lexer error.
 */
static inline int
getcharfromstr(struct tokenstate *ts)
{
	int c;

	if (ts->look != '\\') {
		/* plain character */
		c = ts->look;
		getch(ts);
		return c;
	}

	getch(ts);	/* step over the backslash */
	switch (ts->look) {
	case 'n':
		c = '\n';
		break;
	case 'r':
		c = '\r';
		break;
	case '\\':
		c = '\\';
		break;
	case '"':
		c = '"';
		break;
	default:
		fprintf(stderr, "%s:%i:%i: "
		    "invalid escape sequence\n",
		    ts->filename, ts->line, ts->col);
		exit(EXIT_FAILURE);
		/* NOTREACHED */
		return -1;
	}
	getch(ts);	/* step over the escape letter */
	return c;
}
/**
 * Tokenize ts->fh into ts->tokens.
 *
 * Loop: read a character, classify it (number, identifier/keyword,
 * string, or operator), build the token, repeat until EOF.
 * Returns 0 on success, -1 if ts is NULL; allocation failures and
 * lexical errors print a message and exit.
 *
 * NOTE(review): whitespace is not skipped — a space or newline
 * becomes an OP token.  Preserved as-is; confirm the parser
 * expects this before changing it.
 */
int
tokenize(struct tokenstate *ts)
{
	struct token *token;

	if (ts == NULL)
		return -1;
	getch(ts);
	while (ts->look != 0) {
		/*
		 * Grab the slot for the next token and record the
		 * source position shared by all token kinds.
		 */
		token = getnexttoken(ts);
		token->filename = ts->filename;
		token->col = ts->col;
		token->line = ts->line;
		token->pos = ts->pos;

		if (isdigit(ts->look)) {
			/* number: accumulate decimal digits */
			int64_t num = ts->look - '0';

			getch(ts);
			while (isdigit(ts->look)) {
				num = num * 10 + (ts->look - '0');
				getch(ts);
			}
			token->type = NUM;
			token->val.num = num;
		}
		else if (isalpha(ts->look)) {
			/*
			 * Identifier or keyword.
			 * NOTE: maximum length is 32, maybe change?
			 * 33 bytes so a full 32-char name still has
			 * room for the terminator (was a 1-byte
			 * overflow with malloc(32)).
			 */
			int i;
			char *buf = malloc(33);

			if (buf == NULL) {
				perror("woody lexer");
				exit(EXIT_FAILURE);
			}
			/*
			 * BUG FIX: the loop used to declare its own
			 * shadowing `i` (leaving the outer one
			 * uninitialized for buf[i] = '\0') and
			 * required isalpha AND isdigit, which is
			 * never true.  Accept letters OR digits.
			 */
			for (i = 0; i < 32 &&
			    (isalpha(ts->look) || isdigit(ts->look)); i++) {
				buf[i] = ts->look;
				getch(ts);
			}
			buf[i] = '\0';

			/* keyword?  index in keywords[] == enum kwtype */
			for (i = 0; i < ENDKWTYPE; i++) {
				if (strcmp(buf, keywords[i]) == 0) {
					token->type = KW;
					token->val.kw = i;
					free(buf);	/* was leaked */
					break;	/* was missing: every
						 * keyword fell through
						 * to the VAR case */
				}
			}
			if (i == ENDKWTYPE) {
				/* not a keyword: variable, owns buf */
				token->type = VAR;
				token->val.var = buf;
			}
		}
		else if (ts->look == '"') {
			/* string literal */
			size_t i = 0;
			size_t bufsz = 512;
			char *buf, *tmp;

			getch(ts);	/* step over opening quote */
			if (ts->look == '"') {
				/* empty string is illegal */
				fprintf(stderr,
				    "%s:%i:%i: empty string"
				    " is illegal\n",
				    ts->filename, ts->line,
				    ts->col);
				exit(EXIT_FAILURE);
			}
			buf = malloc(bufsz);
			if (buf == NULL) {
				perror("woody lexer");
				exit(EXIT_FAILURE);
			}
			/*
			 * BUG FIX: the old goto-based refill reset i
			 * to 0 (discarding what was read), never
			 * updated bufsz after realloc, and could
			 * write the '\0' one past the buffer.
			 */
			while (ts->look != '"' && ts->look != '\0') {
				if (i + 1 >= bufsz) {
					bufsz += 128;
					tmp = realloc(buf, bufsz);
					if (tmp == NULL) {
						perror("woody lexer");
						exit(EXIT_FAILURE);
					}
					buf = tmp;
				}
				buf[i++] = getcharfromstr(ts);
			}
			buf[i] = '\0';
			if (ts->look == '"')
				getch(ts);	/* BUG FIX: consume the
						 * closing quote; it used
						 * to become a stray OP
						 * token */
			token->type = STRING;
			token->val.string = buf;
		}
		else {
			/* something else, probably an operator */
			/* don't bother to check here, it's not worth it */
			token->type = OP;
			token->val.op = ts->look;
			getch(ts);
		}
	}
	return 0;
}

30
token.h Normal file
View File

@ -0,0 +1,30 @@
/* please also see token.c */
#ifndef WOODY_TOKEN_H
#define WOODY_TOKEN_H

/* make the header self-contained (int64_t, FILE, size_t) */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* token classes produced by tokenize() */
enum tokentype { KW, STRING, VAR, NUM, OP };

/* keyword ids; order MUST match keywords[] in token.c.
 * ENDKWTYPE is a count sentinel, never stored in a token. */
enum kwtype { IF, THEN, ELSIF, ELSE, WHILE, ENDKWTYPE };

struct token {
	union {
		const char *string, *var;	/* heap-owned text */
		int64_t num;
		char op;
		enum kwtype kw;
	} val;
	const char *filename;	/* borrowed from struct tokenstate */
	int col, line, pos;	/* 1-based source position */
	enum tokentype type;	/* selects the active union member */
};

struct tokenstate {
	/*
	 * tokens
	 * |
	 * |_> | ptr to 1024 struct token | ... |
	 */
	struct token **tokens;
	const char *filename;
	FILE *fh;
	size_t tokensz, tokenblk, tokenind; /* see get_next_token */
	int col, line, pos, look;	/* look: current char, '\0' at EOF */
};

extern int tokenize(struct tokenstate *);

#endif /* WOODY_TOKEN_H */