woody/token.c

/**
 * Plans:
 * - maybe change exit(EXIT_FAILURE) to some kind of woody_exit?
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "token.h"

/*
 * Spellings of the language keywords.  tokenizevarkw() compares each
 * identifier against this table and stores the index of the matching
 * entry in token->val.kw.
 * NOTE(review): presumably the order must match the keyword enum in
 * token.h, and ENDKWTYPE must equal the number of entries here --
 * confirm against the header.
 */
static const char *keywords[] = {
	"if", "then", "elsif", "else", "while"
};

/**
 * Reads one character from ts->fh into ts->look.
 *
 * When getc() returns EOF, ts->look is set to '\0' and the
 * position counters are left untouched.  Otherwise pos and col
 * advance by one; a newline additionally bumps line and restarts
 * the column count first.
 */
static inline void
getch(struct tokenstate *ts)
{
	const int c = getc(ts->fh);

	/* end of input: '\0' is the lexer's EOF marker */
	if (c == EOF) {
		ts->look = '\0';
		return;
	}

	if (c == '\n') {
		ts->line++;
		ts->col = 0;
	}

	ts->look = c;
	ts->pos++;
	ts->col++;
}

/**
 * Skips whitespace in the input, stopping at a newline (newlines
 * are significant: tokenize() turns them into NEWLINE tokens) or
 * at the first non-whitespace character.
 */
static inline void
skip_whitespace(struct tokenstate *ts)
{
	/*
	 * The <ctype.h> functions require a value representable as
	 * unsigned char (or EOF); the cast keeps bytes above 0x7F from
	 * being passed as negative values.
	 */
	while (ts->look != '\n' && isspace((unsigned char)ts->look))
		getch(ts);
}

/**
 * Returns a pointer to where the next token should be put.
 * In struct tokenstate, the field tokens is an array of
 * pointers to ``blocks'' of 1024 struct tokens.  This function
 * checks whether tokenind is still below the block size, and if
 * so, it just returns tokens[tokenblk][tokenind++].  Otherwise it
 * moves on to the next block: the pointer array is grown first if
 * it has no slot left (tokensz slots), then a fresh zeroed block
 * is allocated and its first token is returned.
 *
 * Allocation failure is fatal: the error is printed with perror()
 * and the process exits.
 *
 * Note that I made this method up, it might have some other
 * name already. If so, please contact me so I can change the
 * comments.
 */
static inline struct token *
getnexttoken(struct tokenstate *ts)
{
	/* enough space in this block */
	if (ts->tokenind < 1024)
		return &ts->tokens[ts->tokenblk][ts->tokenind++];

	/* current block is full: advance exactly once and restart the index */
	ts->tokenblk++;
	ts->tokenind = 0;

	/* all pointers in array exhausted, must reallocate */
	if (ts->tokenblk >= ts->tokensz) {
		ts->tokensz = ts->tokenblk + 1;
		/* elements are block pointers, so size by *ts->tokens */
		ts->tokens = realloc(ts->tokens,
		                     ts->tokensz * sizeof *ts->tokens);
		if (ts->tokens == NULL) {
			perror("woody lexer");
			exit(EXIT_FAILURE);
		}
	}

	/* allocate a new block */
	ts->tokens[ts->tokenblk] = calloc(1024, sizeof(struct token));
	if (ts->tokens[ts->tokenblk] == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}

	return &ts->tokens[ts->tokenblk][ts->tokenind++];
}

/**
 * String-context counterpart of getch: returns the current
 * character and advances the input, translating the supported
 * escape sequences on the way.
 * \n newline
 * \r carriage return
 * Any other character after a backslash is reported as an error
 * and terminates the process.
 */
static inline int
getcharfromstr(struct tokenstate *ts)
{
	int c;

	/* ordinary character: hand it back and move on */
	if (ts->look != '\\') {
		c = ts->look;
		getch(ts);
		return c;
	}

	/* escape sequence: inspect the character after the backslash */
	getch(ts);
	if (ts->look == 'n')
		c = '\n';
	else if (ts->look == 'r')
		c = '\r';
	else {
		fprintf(stderr, "%s:%i:%i: "
		                "invalid escape sequence\n",
				ts->filename, ts->line, ts->col);
		exit(EXIT_FAILURE);
	}

	/* step past the escape letter */
	getch(ts);
	return c;
}

/**
 * Tokenizes a decimal integer literal.  The caller must have
 * checked that ts->look is a digit; that first digit is consumed
 * unconditionally, followed by every digit after it.
 *
 * The result is stored in token->val.num and token->type is set
 * to NUM.  A literal that does not fit in an int64_t is reported
 * as an error and terminates the process instead of overflowing
 * (signed overflow is undefined behaviour).
 */
static inline void
tokenizenum(struct token *token, struct tokenstate *ts)
{
	int64_t num = ts->look - '0';

	getch(ts);
	/* cast: <ctype.h> functions need an unsigned char value */
	while (isdigit((unsigned char)ts->look)) {
		const int digit = ts->look - '0';

		/* would num * 10 + digit exceed INT64_MAX? */
		if (num > (INT64_MAX - digit) / 10) {
			fprintf(stderr, "%s:%i:%i: "
			                "integer literal too large\n",
					ts->filename, ts->line, ts->col);
			exit(EXIT_FAILURE);
		}

		num = num * 10 + digit;
		getch(ts);
	}

	token->type = NUM;
	token->val.num = num;
}

/**
 * Tokenizes a variable or keyword.
 *
 * Reads up to 32 alphanumeric characters into a freshly allocated
 * buffer.  If the text matches an entry of keywords[], the token
 * becomes a KW whose val.kw is the index of that entry and the
 * buffer is freed.  Otherwise the token becomes a VAR and val.var
 * takes ownership of the buffer.
 *
 * An identifier longer than 32 characters is cut at 32; the
 * remaining characters are left in the input for the next token.
 *
 * NOTE(review): the keyword loop assumes ENDKWTYPE does not exceed
 * the number of entries in keywords[] -- confirm against token.h.
 */
static inline void
tokenizevarkw(struct token *token, struct tokenstate *ts)
{
	/* NOTE: maximum var size is 32, maybe change? */
	const int maxlen = 32;
	int i;
	char *buf = malloc(maxlen + 1); /* +1 for the terminating NUL */
	if (buf == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}

	/* read the var/kw; cast: <ctype.h> needs an unsigned char value */
	for (i = 0; i < maxlen && isalnum((unsigned char)ts->look); i++) {
		buf[i] = ts->look;
		getch(ts);
	}
	buf[i] = '\0';

	for (i = 0; i < ENDKWTYPE; i++) {
		if (strcmp(buf, keywords[i]) == 0) {
			/* keyword: the spelling is no longer needed */
			token->type = KW;
			token->val.kw = i;
			free(buf);
			return;
		}
	}

	/* variable */
	token->type = VAR;
	token->val.var = buf;
}

/**
 * Tokenizes a double-quoted string literal.  Must be called with
 * ts->look on the opening quote.
 *
 * The characters between the quotes are read through
 * getcharfromstr() (so escape sequences are translated) into a
 * heap buffer that grows as needed.  On success the closing quote
 * is consumed, token->type is set to STRING and token->val.string
 * takes ownership of the NUL-terminated buffer.
 *
 * An empty string, a string that is still open at end of input,
 * and allocation failure are all fatal: a message is printed and
 * the process exits.
 */
static inline void
tokenizestring(struct token *token, struct tokenstate *ts)
{
	size_t len = 0;
	size_t bufsz = 512;
	char *buf;

	/* step past the opening quote */
	getch(ts);
	if (ts->look == '"') {
		/* empty string is illegal */
		fprintf(stderr,
		        "%s:%i:%i: empty string"
		        " is illegal\n",
			ts->filename, ts->line,
			ts->col);
		exit(EXIT_FAILURE);
	}

	buf = malloc(bufsz);
	if (buf == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}

	while (ts->look != '"' && ts->look != '\0') {
		/* always keep one byte spare for the terminating NUL */
		if (len + 1 >= bufsz) {
			char *grown;

			bufsz *= 2;
			grown = realloc(buf, bufsz);
			if (grown == NULL) {
				perror("woody lexer");
				exit(EXIT_FAILURE);
			}
			buf = grown;
		}

		buf[len++] = (char)getcharfromstr(ts);
	}

	if (ts->look != '"') {
		/* input ended before the closing quote */
		fprintf(stderr,
		        "%s:%i:%i: unterminated string\n",
			ts->filename, ts->line,
			ts->col);
		exit(EXIT_FAILURE);
	}

	/* consume the closing quote so it does not start a new string */
	getch(ts);

	buf[len] = '\0';

	token->type = STRING;
	token->val.string = buf;
}

/**
 * This function loops through the given file and tokenizes it.
 * It works as follows:
 * Step 1: Skip whitespace (newlines are kept, they are tokens)
 * Step 2: Check if the character starts a number, a
 *         keyword/variable, a string, is a newline, or is
 *         something else (treated as a one-character operator)
 * Step 3: Do the appropriate action for parsing
 * Step 4: Repeat until EOF (ts->look == 0)
 *
 * Returns 0 on success, or -1 if ts is NULL.
 */
int
tokenize(struct tokenstate *ts)
{
	struct token *token;

	if (ts == NULL)
		return -1;

	/* prime the lookahead character */
	getch(ts);

	for (;;) {
		skip_whitespace(ts);
		if (ts->look == 0)
			break;

		/**
		 * Get a pointer to the next struct token and set
		 * information used across all cases.
		 */
		token = getnexttoken(ts);
		token->filename = ts->filename;
		token->col = ts->col;
		token->line = ts->line;
		token->pos = ts->pos;

		/*
		 * The <ctype.h> functions require a value representable
		 * as unsigned char, hence the casts: a byte above 0x7F
		 * must not be passed as a negative value.
		 */
		if (isdigit((unsigned char)ts->look))
			tokenizenum(token, ts);
		else if (isalpha((unsigned char)ts->look))
			tokenizevarkw(token, ts);
		else if (ts->look == '"')
			tokenizestring(token, ts);
		else if (ts->look == '\n') {
			token->type = NEWLINE;
			getch(ts);
		}
		else {
			/* something else, probably an operator */
			/* don't bother to check here, it's not worth it */
			token->type = OP;
			token->val.op = ts->look;
			getch(ts);
		}
	}

	return 0;
}
add tokenizer 2022-04-01 14:12:28 +00:00			`/**`
			`* Plans:`
			`* - maybe change exit(EXIT_FAILURE) to some kind of woody_exit?`
			`*/`
			`#include <stdint.h>`
			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <ctype.h>`
			`#include "token.h"`

			`static const char *keywords[] = {`
			`"if", "then", "elsif", "else", "while"`
			`};`

			`/**`
			`* Gets the next character in the file, and increments`
			`* the position/column/line in the struct tokenstate.`
			`*/`
			`static inline void`
			`getch(struct tokenstate *ts)`
			`{`
			`int look = getc(ts->fh);`

			`if (look == EOF)`
			`ts->look = '\0';`
			`else {`
			`if (look == '\n') {`
			`ts->col = 0;`
			`ts->line++;`
			`}`
			`ts->look = look;`
			`ts->col++;`
			`ts->pos++;`
			`}`
			`}`

Fix EOF bug and add token printing to main.c test() 2022-04-02 08:09:37 +00:00			`/**`
			`* Skips whitespace in the input.`
			`*/`
			`static inline void`
			`skip_whitespace(struct tokenstate *ts)`
			`{`
Add newline as token type and add skeleton for parser 2022-04-02 10:10:01 +00:00			`while (isspace(ts->look) && ts->look != '\n')`
Fix EOF bug and add token printing to main.c test() 2022-04-02 08:09:37 +00:00			`getch(ts);`
			`}`

add tokenizer 2022-04-01 14:12:28 +00:00			`/**`
			`* Returns a pointer to where the next token should be put.`
			`* In struct tokenstate, the field tokens is an array of`
			* pointers to ``blocks'' of 1024 struct tokens. This function
			`* checks whether tokenind is over the limit, and if so,`
			`* it just returns tokens[tokenblk][tokenind++]. Otherwise, it`
			`* allocates a new block.`
			`*`
			`* Note that I made this method up, it might have some other`
			`* name already. If so, please contact me so I can change the`
			`* comments.`
			`*/`
			`static inline struct token *`
			`getnexttoken(struct tokenstate *ts)`
			`{`
			`/* enough space in this block */`
			`if (ts->tokenind < 1024)`
			`return &ts->tokens[ts->tokenblk][ts->tokenind++];`
			`/* not enough space in this block */`
			`else {`
			`if (++ts->tokenblk < ts->tokensz) {`
			`alloc_new_block:`
			`/* allocate a new block */`
			`ts->tokens[ts->tokenblk] = calloc(1024,`
			`sizeof(struct token));`
			`if (ts->tokens[ts->tokenblk] == NULL) {`
			`perror("woody lexer");`
			`exit(EXIT_FAILURE);`
			`}`
			`return &ts->tokens[ts->tokenblk][ts->tokenind++];`

			`}`
			`/* all pointers in array exhausted, must reallocate */`
			`else {`
			`ts->tokensz++; /* need larger tokensz */`
			`ts->tokenblk++; /* go to next (new) block */`
			`ts->tokens = realloc(ts->tokens,`
			`ts->tokensz * sizeof(struct token));`
			`if (ts->tokens == NULL) {`
			`perror("woody lexer");`
			`exit(EXIT_FAILURE);`
			`}`

			`ts->tokenind = 0;`

			`goto alloc_new_block;`
			`}`
			`}`
			`/* NOTREACHED */`
			`}`

			`/**`
			`* Used to get a character like getch but in the context`
			`* of a string. Thus also accepting escape sequences.`
			`* \n newline`
			`* \r carriage return`
			`*/`
			`static inline int`
			`getcharfromstr(struct tokenstate *ts)`
			`{`
			`if (ts->look == '\\') {`
			`getch(ts);`
			`switch (ts->look) {`
			`case 'n':`
			`getch(ts);`
			`return '\n';`
			`case 'r':`
			`getch(ts);`
			`return '\r';`
			`default:`
			`fprintf(stderr, "%s:%i:%i: "`
			`"invalid escape sequence\n",`
			`ts->filename, ts->line, ts->col);`
			`exit(EXIT_FAILURE);`
			`/* NOTREACHED */`
			`return -1;`
			`}`
			`}`
			`else {`
			`int look = ts->look;`
			`getch(ts);`
			`return look;`
			`}`
			`}`

refactor tokenize function 2022-04-01 15:42:22 +00:00			`static inline void`
			`tokenizenum(struct token token, struct tokenstate ts)`
			`{`
			`int64_t num = ts->look - '0';`

			`getch(ts);`
			`while (isdigit(ts->look)) {`
			`num *= 10;`
			`num += ts->look - '0';`
			`getch(ts);`
			`}`

			`token->type = NUM;`
			`token->val.num = num;`

			`}`

			`/* tokenize a variable or keyword */`
			`static inline void`
			`tokenizevarkw(struct token token, struct tokenstate ts)`
			`{`
			`/* NOTE: maximum var size is 32, maybe change? */`
			`int i;`
			`char *buf = malloc(32);`
			`if (buf == NULL) {`
			`perror("woody lexer");`
			`exit(EXIT_FAILURE);`
			`}`

			`/* read the var/kw */`
Fix segfault in tokenizevarkw Also added debug flags to CFLAGS and added little testing thing in main.c. 2022-04-02 04:33:43 +00:00			`for (i = 0; i < 32 && (isalpha(ts->look)`
			`\|\| isdigit(ts->look)); i++) {`
refactor tokenize function 2022-04-01 15:42:22 +00:00			`buf[i] = ts->look;`
			`getch(ts);`
			`}`
			`buf[i] = '\0';`

			`for (i = 0; i < ENDKWTYPE; i++) {`
			`if (strcmp(buf, keywords[i]) == 0) {`
			`/* keyword */`
			`token->type = KW;`
			`token->val.kw = i;`
			`}`
			`}`

			`/* variable */`
			`if (i == ENDKWTYPE) {`
			`token->type = VAR;`
			`token->val.var = buf;`
			`}`
			`}`

			`static inline void`
			`tokenizestring(struct token token, struct tokenstate ts)`
			`{`
			`int i = 0;`
			`size_t bufsz = 512;`
			`char *buf;`

			`getch(ts);`
			`if (ts->look == '"') {`
			`/* empty string is illegal */`
			`fprintf(stderr,`
			`"%s:%i:%i: empty string"`
			`" is illegal\n",`
			`ts->filename, ts->line,`
			`ts->col);`
			`exit(EXIT_FAILURE);`
			`}`

			`buf = malloc(bufsz);`
			`if (buf == NULL) {`
			`perror("woody lexer");`
			`exit(EXIT_FAILURE);`
			`}`

			`read_string:`
			`for (; i < bufsz && ts->look != '"'`
			`&& ts->look != '\0'; i++)`
			`buf[i] = getcharfromstr(ts);`

			`if (ts->look != '"' && ts->look != '\0') {`
			`/* buf too small */`
			`buf = realloc(buf, bufsz + 127);`
			`if (buf == NULL) {`
			`perror("woody lexer");`
			`exit(EXIT_FAILURE);`
			`}`

			`goto read_string;`
			`}`

			`buf[i] = '\0';`

			`token->type = STRING;`
			`token->val.string = buf;`
			`}`
add tokenizer 2022-04-01 14:12:28 +00:00
			`/**`
			`* This function loops through the given file and tokenizes it.`
			`* It works as follows:`
			`* Step 1: Get character`
			`* Step 2: Check if character is a string, number, operator`
			`* or a keyword/variable`
			`* Step 3: Do the appropriate action for parsing`
			`* Step 4: Repeat until EOF`
			`*/`
			`int`
			`tokenize(struct tokenstate *ts)`
			`{`
			`struct token *token;`

			`if (ts == NULL)`
			`return -1;`

			`getch(ts);`

			`while (ts->look != 0) {`
Fix EOF bug and add token printing to main.c test() 2022-04-02 08:09:37 +00:00			`skip_whitespace(ts);`
			`if (ts->look == 0)`
			`break;`

add tokenizer 2022-04-01 14:12:28 +00:00			`/**`
			`* Get a pointer to the next struct token and set`
			`* information used across all cases.`
			`*/`
			`token = getnexttoken(ts);`
			`token->filename = ts->filename;`
			`token->col = ts->col;`
			`token->line = ts->line;`
			`token->pos = ts->pos;`

refactor tokenize function 2022-04-01 15:42:22 +00:00			`if (isdigit(ts->look))`
			`tokenizenum(token, ts);`
			`else if (isalpha(ts->look))`
			`tokenizevarkw(token, ts);`
			`else if (ts->look == '"')`
			`tokenizestring(token, ts);`
Add newline as token type and add skeleton for parser 2022-04-02 10:10:01 +00:00			`else if (ts->look == '\n') {`
			`token->type = NEWLINE;`
			`getch(ts);`
			`}`
add tokenizer 2022-04-01 14:12:28 +00:00			`else {`
			`/* something else, probably an operator */`
			`/* don't bother to check here, it's not worth it */`
			`token->type = OP;`
			`token->val.op = ts->look;`
			`getch(ts);`
			`}`
			`}`

			`return 0;`
			`}`