add tokenizer

2022-04-01 22:12:28 +08:00
parent fbf7a1e7d3
commit efca2f5483
4 changed files with 292 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 *.o
 woody
--- a/7
+++ b/7
@@ -1,8 +1,8 @@
 .POSIX:
-CC	= pcc
+CC	= tcc
-CFLAGS	= -std=c99
+CFLAGS	= -Wall -Wunsupported -Wwrite-strings
-OBJ	= main.o
+OBJ	= main.o token.o
 all: woody
@@ -10,6 +10,7 @@ woody: $(OBJ)
 	$(CC) $(CFLAGS) -o woody $(OBJ)
 main.o: main.c token.o
 token.o: token.c token.h
 clean:
 	rm -f woody $(OBJ)
--- a/token.c
+++ b/token.c
@@ -0,0 +1,256 @@
 /**
 * Plans:
 * - maybe change exit(EXIT_FAILURE) to some kind of woody_exit?
 */
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include "token.h"
 /*
 * Spellings of the keywords the lexer recognises.  tokenize() compares
 * identifiers against this table by index, and that index is what it
 * stores in token.val.kw.
 */
 static const char *keywords[] = {
 	"if",
 	"then",
 	"elsif",
 	"else",
 	"while",
 };
 /**
 * Reads one character from ts->fh into ts->look and advances the
 * position bookkeeping in ts.
 *
 * At end of file ts->look becomes '\0' and none of the counters
 * change.  For any other character, pos and col are incremented;
 * a newline additionally bumps line and resets col to 0 before
 * that increment, so col is 1 right after a newline was read.
 */
 static inline void
 getch(struct tokenstate *ts)
 {
 	const int c = getc(ts->fh);
 	/* end of input: signal it through look and leave the counters alone */
 	if (c == EOF) {
 		ts->look = '\0';
 		return;
 	}
 	if (c == '\n') {
 		ts->line++;
 		ts->col = 0;
 	}
 	ts->pos++;
 	ts->col++;
 	ts->look = c;
 }
 /**
 * Returns a pointer to where the next token should be put.
 * In struct tokenstate, the field tokens is an array of
 * tokensz pointers to ``blocks'' of 1024 struct tokens.
 *
 * While the current block (tokens[tokenblk]) still has a free
 * slot, that slot is returned and tokenind is advanced.  Once
 * the block is full, the function moves to slot 0 of the next
 * block, growing the pointer array by one entry first when all
 * of its entries are already in use, and allocates the block.
 *
 * Allocation failures are fatal: the error is reported with
 * perror() and the process exits.
 *
 * NOTE(review): the fast path assumes the caller has already
 * allocated tokens[tokenblk] for the initial block -- confirm
 * against the code that sets up struct tokenstate.
 *
 * Note that I made this method up, it might have some other
 * name already. If so, please contact me so I can change the
 * comments.
 */
 static inline struct token *
 getnexttoken(struct tokenstate *ts)
 {
 	enum { BLOCKLEN = 1024 };
 	struct token **blocks;
 	/* enough space in this block */
 	if (ts->tokenind < BLOCKLEN)
 		return &ts->tokens[ts->tokenblk][ts->tokenind++];
 	/* block is full: continue at the first slot of the next block */
 	ts->tokenblk++;
 	ts->tokenind = 0;
 	if (ts->tokenblk >= ts->tokensz) {
 		/*
 		 * All pointers in the array are used up.  Grow it so that
 		 * tokenblk becomes a valid index; keep the old pointer
 		 * until realloc is known to have succeeded.
 		 */
 		size_t newsz = ts->tokenblk + 1;
 		blocks = realloc(ts->tokens, newsz * sizeof *blocks);
 		if (blocks == NULL) {
 			perror("woody lexer");
 			exit(EXIT_FAILURE);
 		}
 		ts->tokens = blocks;
 		ts->tokensz = newsz;
 	}
 	/* allocate the new (zero-filled) block */
 	ts->tokens[ts->tokenblk] = calloc(BLOCKLEN, sizeof **ts->tokens);
 	if (ts->tokens[ts->tokenblk] == NULL) {
 		perror("woody lexer");
 		exit(EXIT_FAILURE);
 	}
 	return &ts->tokens[ts->tokenblk][ts->tokenind++];
 }
 /**
 * Fetches one logical character of a string literal, starting
 * at ts->look, and advances the input past it.
 *
 * An ordinary character is returned unchanged.  A backslash
 * introduces an escape sequence; the supported ones are:
 * \n newline
 * \r carriage return
 * Any other character after a backslash is reported as an
 * invalid escape sequence and terminates the process.
 */
 static inline int
 getcharfromstr(struct tokenstate *ts)
 {
 	int ch = ts->look;
 	/* plain character: hand it back and step over it */
 	if (ch != '\\') {
 		getch(ts);
 		return ch;
 	}
 	/* escape sequence: look at the character after the backslash */
 	getch(ts);
 	if (ts->look == 'n')
 		ch = '\n';
 	else if (ts->look == 'r')
 		ch = '\r';
 	else {
 		fprintf(stderr, "%s:%i:%i: "
 		                "invalid escape sequence\n",
 				ts->filename, ts->line, ts->col);
 		exit(EXIT_FAILURE);
 		/* NOTREACHED */
 		return -1;
 	}
 	/* step over the escape letter itself */
 	getch(ts);
 	return ch;
 }
 /**
 * This function loops through the given file and tokenizes it.
 * It works as follows:
 * Step 1: Get character
 * Step 2: Check if character is a string, number, operator
 *         or a keyword/variable
 * Step 3: Do the appropriate action for parsing
 * Step 4: Repeat until EOF
 */
 int
 tokenize(struct tokenstate *ts)
 {
 	struct token *token;
 	if (ts == NULL)
 		return -1;
 	getch(ts);
 	while (ts->look != 0) {
 		/**
 		 * Get a pointer to the next struct token and set
 		 * information used across all cases.
 		 */
 		token = getnexttoken(ts);
 		token->filename = ts->filename;
 		token->col = ts->col;
 		token->line = ts->line;
 		token->pos = ts->pos;
 		if (isdigit(ts->look)) {
 			/* number */
 			int64_t num = ts->look - '0';
 			getch(ts);
 			while (isdigit(ts->look)) {
 				num *= 10;
 				num += ts->look - '0';
 				getch(ts);
 			}
 			token->type = NUM;
 			token->val.num = num;
 		}
 		else if (isalpha(ts->look)) {
 			/* variable or keyword */
 			/* NOTE: maximum var size is 32, maybe change? */
 			int i;
 			char *buf = malloc(32);
 			if (buf == NULL) {
 				perror("woody lexer");
 				exit(EXIT_FAILURE);
 			}
 			/* read the var/kw */
 			for (int i = 0; i < 32 && isalpha(ts->look)
 					&& isdigit(ts->look); i++) {
 				buf[i] = ts->look;
 				getch(ts);
 			}
 			buf[i] = '\0';
 			for (i = 0; i < ENDKWTYPE; i++) {
 				if (strcmp(buf, keywords[i]) == 0) {
 					/* keyword */
 					token->type = KW;
 					token->val.kw = i;
 				}
 			}
 			/* variable */
 			if (i == ENDKWTYPE) {
 				token->type = VAR;
 				token->val.var = buf;
 			}
 		}
 		else if (ts->look == '"') {
 			/* string */
 			int i;
 			size_t bufsz = 512;
 			char *buf;
 			getch(ts);
 			if (ts->look == '"') {
 				/* empty string is illegal */
 				fprintf(stderr,
 				        "%s:%i:%i: empty string"
 				        " is illegal\n",
 					ts->filename, ts->line,
 					ts->col);
 				exit(EXIT_FAILURE);
 			}
 			buf = malloc(bufsz);
 			if (buf == NULL) {
 				perror("woody lexer");
 				exit(EXIT_FAILURE);
 			}
 read_string:
 			for (i = 0; i < bufsz && ts->look != '"'
 					&& ts->look != '\0'; i++)
 				buf[i] = getcharfromstr(ts);
 			if (ts->look != '"' && ts->look != '\0') {
 				/* buf too small */
 				buf = realloc(buf, bufsz + 127);
 				if (buf == NULL) {
 					perror("woody lexer");
 					exit(EXIT_FAILURE);
 				}
 				goto read_string;
 			}
 			buf[i] = '\0';
 			token->type = STRING;
 			token->val.string = buf;
 		}
 		else {
 			/* something else, probably an operator */
 			/* don't bother to check here, it's not worth it */
 			token->type = OP;
 			token->val.op = ts->look;
 			getch(ts);
 		}
 	}
 	return 0;
 }
--- a/token.h
+++ b/token.h
@@ -0,0 +1,30 @@
 /* please also see token.c */
 /* Kind of a token; tokenize() sets it together with the matching val member. */
 enum tokentype {
 	KW,     /* keyword, val.kw */
 	STRING, /* string literal, val.string */
 	VAR,    /* variable name, val.var */
 	NUM,    /* integer literal, val.num */
 	OP      /* any other single character, val.op */
 };
 /*
 * Keyword identifiers.  token.c indexes its keywords[] table with
 * these values, so both must list the keywords in the same order.
 * ENDKWTYPE is not a keyword; it is the loop bound for that table.
 */
 enum kwtype {
 	IF,
 	THEN,
 	ELSIF,
 	ELSE,
 	WHILE,
 	ENDKWTYPE
 };
 /*
 * One lexed token.  type says which member of val is valid; the
 * remaining fields are copied from the tokenizer state when the
 * token is started.
 */
 struct token {
 	union {
 		const char *string; /* STRING: buffer allocated by the lexer */
 		const char *var;    /* VAR: buffer allocated by the lexer */
 		int64_t num;        /* NUM */
 		char op;            /* OP */
 		enum kwtype kw;     /* KW */
 	} val;
 	const char *filename;
 	int col;
 	int line;
 	int pos;
 	enum tokentype type;
 };
 /* State of one tokenize() run over the stream fh. */
 struct tokenstate {
 	/*
 	 * tokens
 	 * |
 	 * |_> | ptr to 1024 struct token | ... |
 	 */
 	struct token **tokens;
 	const char *filename; /* copied into tokens and used in diagnostics */
 	FILE *fh;             /* input stream the lexer reads from */
 	/* see getnexttoken() in token.c for the next three fields */
 	size_t tokensz;       /* number of block pointers in tokens */
 	size_t tokenblk;      /* index of the block currently being filled */
 	size_t tokenind;      /* next free slot in that block */
 	int col;
 	int line;
 	int pos;
 	int look;             /* current character, '\0' at end of input */
 };
 /*
 * Tokenizes the stream described by ts (see token.c).
 * Returns -1 if ts is NULL, 0 otherwise.
 */
 int tokenize(struct tokenstate *ts);