add tokenizer

This commit is contained in:
matt 2022-04-01 22:12:28 +08:00
parent fbf7a1e7d3
commit efca2f5483
4 changed files with 292 additions and 3 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.o
woody

View File

@ -1,8 +1,8 @@
.POSIX:
CC = pcc
CFLAGS = -std=c99
OBJ = main.o
CC = tcc
CFLAGS = -Wall -Wunsupported -Wwrite-strings
OBJ = main.o token.o
all: woody
@ -10,6 +10,7 @@ woody: $(OBJ)
$(CC) $(CFLAGS) -o woody $(OBJ)
main.o: main.c token.o
token.o: token.c token.h
clean:
rm -f woody $(OBJ)

256
token.c Normal file
View File

@ -0,0 +1,256 @@
/**
* Plans:
* - maybe change exit(EXIT_FAILURE) to some kind of woody_exit?
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "token.h"
/*
 * Keyword spellings. The index of each entry MUST match its
 * enum kwtype value in token.h (IF..WHILE); the keyword scan in
 * tokenize() stores the array index directly as token->val.kw.
 */
static const char *keywords[] = {
	"if", "then", "elsif", "else", "while"
};
/**
 * Advance the lexer by one character: read the next character
 * from ts->fh into ts->look and keep pos/col/line up to date.
 * EOF is mapped to '\0' so callers can test a single sentinel.
 */
static inline void
getch(struct tokenstate *ts)
{
	int c = getc(ts->fh);

	if (c == EOF) {
		ts->look = '\0';
		return;
	}
	if (c == '\n') {
		/* the newline itself counts as column 1 of the next line */
		ts->col = 0;
		ts->line++;
	}
	ts->look = c;
	ts->col++;
	ts->pos++;
}
/**
 * Return a pointer to the slot where the next token should be put.
 *
 * ts->tokens is an array (of length ts->tokensz) of pointers to
 * ``blocks'' of 1024 struct tokens; ts->tokenblk is the block in
 * use and ts->tokenind the next free slot in it.  When the current
 * block is full we move to the next block, growing the pointer
 * array first if it is exhausted, and allocate the new block.
 *
 * Never returns NULL: allocation failure prints an error and exits.
 */
static inline struct token *
getnexttoken(struct tokenstate *ts)
{
	/* fast path: room left in the current block */
	if (ts->tokenind < 1024)
		return &ts->tokens[ts->tokenblk][ts->tokenind++];

	/* current block full: advance to the next block slot */
	ts->tokenblk++;
	ts->tokenind = 0;	/* BUG FIX: was left at 1024, indexing
				 * the fresh block out of bounds */

	if (ts->tokenblk >= ts->tokensz) {
		/* pointer array exhausted, grow it by one slot */
		struct token **tmp;

		ts->tokensz++;
		/* BUG FIX: element size is a *pointer*, not a token;
		 * also keep the old pointer on realloc failure */
		tmp = realloc(ts->tokens,
		    ts->tokensz * sizeof(struct token *));
		if (tmp == NULL) {
			perror("woody lexer");
			exit(EXIT_FAILURE);
		}
		ts->tokens = tmp;
	}

	/* allocate the new 1024-token block */
	ts->tokens[ts->tokenblk] = calloc(1024, sizeof(struct token));
	if (ts->tokens[ts->tokenblk] == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}
	return &ts->tokens[ts->tokenblk][ts->tokenind++];
}
/**
 * Like getch but in the context of a string literal: decode one
 * (possibly escaped) character, advance past it, and return it.
 * Recognized escape sequences:
 *   \n  newline
 *   \r  carriage return
 *   \\  backslash
 *   \"  double quote
 * (\\ and \" are new: without them there was no way to put a
 * backslash or a quote inside a string at all.)
 * Any other escape is a fatal lexer error.
 */
static inline int
getcharfromstr(struct tokenstate *ts)
{
	int c;

	if (ts->look != '\\') {
		/* plain character */
		c = ts->look;
		getch(ts);
		return c;
	}

	getch(ts);	/* step over the backslash */
	switch (ts->look) {
	case 'n':
		c = '\n';
		break;
	case 'r':
		c = '\r';
		break;
	case '\\':
		c = '\\';
		break;
	case '"':
		c = '"';
		break;
	default:
		fprintf(stderr, "%s:%i:%i: "
		    "invalid escape sequence\n",
		    ts->filename, ts->line, ts->col);
		exit(EXIT_FAILURE);
		/* NOTREACHED */
		return -1;
	}
	getch(ts);	/* step over the escape letter */
	return c;
}
/**
 * Tokenize ts->fh into ts->tokens.
 *
 * Loop: read a character, classify it (number, identifier/keyword,
 * string, or operator), build the token, repeat until EOF.
 * Returns 0 on success, -1 if ts is NULL; allocation failures and
 * lexical errors print a message and exit.
 *
 * NOTE(review): whitespace is not skipped — a space or newline
 * becomes an OP token.  Preserved as-is; confirm the parser
 * expects this before changing it.
 */
int
tokenize(struct tokenstate *ts)
{
	struct token *token;

	if (ts == NULL)
		return -1;
	getch(ts);
	while (ts->look != 0) {
		/*
		 * Grab the slot for the next token and record the
		 * source position shared by all token kinds.
		 */
		token = getnexttoken(ts);
		token->filename = ts->filename;
		token->col = ts->col;
		token->line = ts->line;
		token->pos = ts->pos;

		if (isdigit(ts->look)) {
			/* number: accumulate decimal digits */
			int64_t num = ts->look - '0';

			getch(ts);
			while (isdigit(ts->look)) {
				num = num * 10 + (ts->look - '0');
				getch(ts);
			}
			token->type = NUM;
			token->val.num = num;
		}
		else if (isalpha(ts->look)) {
			/*
			 * Identifier or keyword.
			 * NOTE: maximum length is 32, maybe change?
			 * 33 bytes so a full 32-char name still has
			 * room for the terminator (was a 1-byte
			 * overflow with malloc(32)).
			 */
			int i;
			char *buf = malloc(33);

			if (buf == NULL) {
				perror("woody lexer");
				exit(EXIT_FAILURE);
			}
			/*
			 * BUG FIX: the loop used to declare its own
			 * shadowing `i` (leaving the outer one
			 * uninitialized for buf[i] = '\0') and
			 * required isalpha AND isdigit, which is
			 * never true.  Accept letters OR digits.
			 */
			for (i = 0; i < 32 &&
			    (isalpha(ts->look) || isdigit(ts->look)); i++) {
				buf[i] = ts->look;
				getch(ts);
			}
			buf[i] = '\0';

			/* keyword?  index in keywords[] == enum kwtype */
			for (i = 0; i < ENDKWTYPE; i++) {
				if (strcmp(buf, keywords[i]) == 0) {
					token->type = KW;
					token->val.kw = i;
					free(buf);	/* was leaked */
					break;	/* was missing: every
						 * keyword fell through
						 * to the VAR case */
				}
			}
			if (i == ENDKWTYPE) {
				/* not a keyword: variable, owns buf */
				token->type = VAR;
				token->val.var = buf;
			}
		}
		else if (ts->look == '"') {
			/* string literal */
			size_t i = 0;
			size_t bufsz = 512;
			char *buf, *tmp;

			getch(ts);	/* step over opening quote */
			if (ts->look == '"') {
				/* empty string is illegal */
				fprintf(stderr,
				    "%s:%i:%i: empty string"
				    " is illegal\n",
				    ts->filename, ts->line,
				    ts->col);
				exit(EXIT_FAILURE);
			}
			buf = malloc(bufsz);
			if (buf == NULL) {
				perror("woody lexer");
				exit(EXIT_FAILURE);
			}
			/*
			 * BUG FIX: the old goto-based refill reset i
			 * to 0 (discarding what was read), never
			 * updated bufsz after realloc, and could
			 * write the '\0' one past the buffer.
			 */
			while (ts->look != '"' && ts->look != '\0') {
				if (i + 1 >= bufsz) {
					bufsz += 128;
					tmp = realloc(buf, bufsz);
					if (tmp == NULL) {
						perror("woody lexer");
						exit(EXIT_FAILURE);
					}
					buf = tmp;
				}
				buf[i++] = getcharfromstr(ts);
			}
			buf[i] = '\0';
			if (ts->look == '"')
				getch(ts);	/* BUG FIX: consume the
						 * closing quote; it used
						 * to become a stray OP
						 * token */
			token->type = STRING;
			token->val.string = buf;
		}
		else {
			/* something else, probably an operator */
			/* don't bother to check here, it's not worth it */
			token->type = OP;
			token->val.op = ts->look;
			getch(ts);
		}
	}
	return 0;
}

30
token.h Normal file
View File

@ -0,0 +1,30 @@
/* please also see token.c */
#ifndef WOODY_TOKEN_H
#define WOODY_TOKEN_H

/* make the header self-contained (int64_t, FILE, size_t) */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* token classes produced by tokenize() */
enum tokentype { KW, STRING, VAR, NUM, OP };

/* keyword ids; order MUST match keywords[] in token.c.
 * ENDKWTYPE is a count sentinel, never stored in a token. */
enum kwtype { IF, THEN, ELSIF, ELSE, WHILE, ENDKWTYPE };

struct token {
	union {
		const char *string, *var;	/* heap-owned text */
		int64_t num;
		char op;
		enum kwtype kw;
	} val;
	const char *filename;	/* borrowed from struct tokenstate */
	int col, line, pos;	/* 1-based source position */
	enum tokentype type;	/* selects the active union member */
};

struct tokenstate {
	/*
	 * tokens
	 * |
	 * |_> | ptr to 1024 struct token | ... |
	 */
	struct token **tokens;
	const char *filename;
	FILE *fh;
	size_t tokensz, tokenblk, tokenind; /* see get_next_token */
	int col, line, pos, look;	/* look: current char, '\0' at EOF */
};

extern int tokenize(struct tokenstate *);

#endif /* WOODY_TOKEN_H */