/* woody/token.c */

/**
 * Plans:
 * - maybe change exit(EXIT_FAILURE) to some kind of woody_exit?
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "token.h"

/*
 * Keyword spellings.  tokenizevarkw() compares an identifier against
 * keywords[0] .. keywords[ENDKWTYPE - 1] and stores the matching index
 * in token->val.kw.
 * NOTE(review): presumably the order mirrors the keyword enum in
 * token.h and ENDKWTYPE equals the number of entries here -- confirm.
 */
static const char *keywords[] = {
	"if",		/* 0 */
	"then",		/* 1 */
	"elsif",	/* 2 */
	"else",		/* 3 */
	"while"		/* 4 */
};

/**
 * Reads one character from ts->fh into ts->look and updates the
 * position counters of the struct tokenstate.
 *
 * When getc() returns EOF, ts->look becomes '\0' and no counter is
 * touched.  Otherwise pos and col each advance by one.  A newline
 * first bumps line and resets col to 0, so after the shared
 * increment the newline itself is recorded at column 1 of the new
 * line.
 */
static inline void
getch(struct tokenstate *ts)
{
	const int ch = getc(ts->fh);

	/* end of input: '\0' is the sentinel the tokenizer loops test for */
	if (ch == EOF) {
		ts->look = '\0';
		return;
	}

	if (ch == '\n') {
		ts->line++;
		ts->col = 0;
	}

	ts->look = ch;
	ts->pos++;
	ts->col++;
}

/**
 * Advances past whitespace in the input, stopping at the first
 * character that is either not whitespace or a newline.  The newline
 * is left in ts->look because tokenize() turns it into a NEWLINE
 * token.
 */
static inline void
skip_whitespace(struct tokenstate *ts)
{
	/*
	 * The <ctype.h> functions are only defined for EOF and values
	 * representable as unsigned char; the cast keeps bytes >= 0x80
	 * from reaching isspace() as negative numbers should look be a
	 * plain (signed) char.
	 */
	while (ts->look != '\n' && isspace((unsigned char)ts->look))
		getch(ts);
}

/**
 * Returns a pointer to where the next token should be put.
 *
 * In struct tokenstate, the field tokens is an array of tokensz
 * pointers to ``blocks'' of 1024 struct tokens.  tokenblk is the
 * index of the block currently being filled and tokenind the next
 * free slot inside it.
 *
 * While tokenind is below the block length the next slot of the
 * current block is handed out.  Once the block is full, filling
 * continues at slot 0 of the following block; that block is
 * allocated (zeroed) here, and the pointer array is first grown by
 * one entry if it has no slot left for it.
 *
 * Allocation failure is fatal: the error is printed with perror()
 * and the process exits with EXIT_FAILURE.
 *
 * NOTE(review): assumes the caller has already allocated
 * tokens[tokenblk] before the first call -- confirm against the code
 * that initialises struct tokenstate.
 *
 * Note from the original author: this scheme was made up here and
 * may already have an established name; if so, please get in touch
 * so the comments can be changed.
 */
static inline struct token *
getnexttoken(struct tokenstate *ts)
{
	enum { TOKBLK_LEN = 1024 };	/* tokens per block */
	void *p;

	/* enough space in this block */
	if (ts->tokenind < TOKBLK_LEN)
		return &ts->tokens[ts->tokenblk][ts->tokenind++];

	/*
	 * Block exhausted: move to the next one exactly once and start
	 * again at its first slot, whichever path provides the block.
	 */
	ts->tokenblk++;
	ts->tokenind = 0;

	/* all pointers in the array are in use, grow it by one entry */
	if (ts->tokenblk >= ts->tokensz) {
		/* elements are block pointers, so size by *ts->tokens */
		p = realloc(ts->tokens,
		            (ts->tokensz + 1) * sizeof *ts->tokens);
		if (p == NULL) {
			perror("woody lexer");
			exit(EXIT_FAILURE);
		}
		ts->tokens = p;
		ts->tokensz++;
	}

	/* allocate the new block */
	p = calloc(TOKBLK_LEN, sizeof **ts->tokens);
	if (p == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}
	ts->tokens[ts->tokenblk] = p;

	return &ts->tokens[ts->tokenblk][ts->tokenind++];
}

/**
 * Like getch, but for use inside a string literal: returns the
 * character at ts->look, translating escape sequences, and advances
 * the input past whatever it consumed.
 *
 * Recognised escapes:
 *   \n  newline
 *   \r  carriage return
 *
 * A backslash followed by anything else is reported on stderr as an
 * invalid escape sequence and terminates the process with
 * EXIT_FAILURE.
 */
static inline int
getcharfromstr(struct tokenstate *ts)
{
	int ch;

	/* ordinary character: hand it back and step over it */
	if (ts->look != '\\') {
		ch = ts->look;
		getch(ts);
		return ch;
	}

	/* step over the backslash and decode the escape letter */
	getch(ts);
	if (ts->look == 'n')
		ch = '\n';
	else if (ts->look == 'r')
		ch = '\r';
	else {
		fprintf(stderr, "%s:%i:%i: invalid escape sequence\n",
		        ts->filename, ts->line, ts->col);
		exit(EXIT_FAILURE);
	}

	/* step over the escape letter */
	getch(ts);
	return ch;
}

/**
 * Reads a run of decimal digits starting at ts->look and stores it
 * in token as a NUM.  The caller must only call this with a digit in
 * ts->look; the first character is consumed unconditionally.
 *
 * A literal that does not fit in an int64_t is reported on stderr
 * and terminates the process with EXIT_FAILURE, instead of
 * overflowing the signed accumulator (which is undefined behaviour).
 */
static inline void
tokenizenum(struct token *token, struct tokenstate *ts)
{
	int64_t num = ts->look - '0';

	getch(ts);
	/* cast: <ctype.h> is only defined for unsigned char values */
	while (isdigit((unsigned char)ts->look)) {
		const int digit = ts->look - '0';

		/* would num * 10 + digit exceed INT64_MAX? */
		if (num > (INT64_MAX - digit) / 10) {
			fprintf(stderr,
			        "%s:%i:%i: integer literal too large\n",
			        ts->filename, ts->line, ts->col);
			exit(EXIT_FAILURE);
		}

		num = num * 10 + digit;
		getch(ts);
	}

	token->type = NUM;
	token->val.num = num;
}

/* tokenize a variable or keyword */
static inline void
tokenizevarkw(struct token *token, struct tokenstate *ts)
{
	/* NOTE: maximum var size is 32, maybe change? */
	int i;
	char *buf = malloc(32);
	if (buf == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}

	/* read the var/kw */
	for (i = 0; i < 32 && (isalpha(ts->look)
			|| isdigit(ts->look)); i++) {
		buf[i] = ts->look;
		getch(ts);
	}
	buf[i] = '\0';

	for (i = 0; i < ENDKWTYPE; i++) {
		if (strcmp(buf, keywords[i]) == 0) {
			/* keyword */
			token->type = KW;
			token->val.kw = i;
		}
	}

	/* variable */
	if (i == ENDKWTYPE) {
		token->type = VAR;
		token->val.var = buf;
	}
}

/**
 * Tokenizes a string literal.  Must be called with the opening '"'
 * in ts->look.  Characters are read through getcharfromstr(), so
 * escape sequences are translated, up to the closing '"', which is
 * consumed as well.
 *
 * token becomes a STRING whose val.string owns a heap-allocated,
 * NUL-terminated copy of the contents (nothing in this file frees
 * it).  The buffer starts at 512 bytes and doubles whenever it
 * fills up.
 *
 * An empty string ("") is illegal: it is reported on stderr and
 * terminates the process with EXIT_FAILURE.  Allocation failure is
 * fatal as well.
 *
 * NOTE(review): a string that is still open at end of input is
 * accepted silently, as before -- confirm whether that should be a
 * diagnostic instead.
 */
static inline void
tokenizestring(struct token *token, struct tokenstate *ts)
{
	size_t len = 0;
	size_t bufsz = 512;
	char *buf;

	getch(ts);	/* step over the opening quote */
	if (ts->look == '"') {
		/* empty string is illegal */
		fprintf(stderr,
		        "%s:%i:%i: empty string is illegal\n",
		        ts->filename, ts->line, ts->col);
		exit(EXIT_FAILURE);
	}

	buf = malloc(bufsz);
	if (buf == NULL) {
		perror("woody lexer");
		exit(EXIT_FAILURE);
	}

	while (ts->look != '"' && ts->look != '\0') {
		/* always keep one byte free for the terminator */
		if (len + 1 >= bufsz) {
			char *grown;

			bufsz *= 2;
			grown = realloc(buf, bufsz);
			if (grown == NULL) {
				perror("woody lexer");
				exit(EXIT_FAILURE);
			}
			buf = grown;
		}

		buf[len] = (char)getcharfromstr(ts);
		len++;
	}
	buf[len] = '\0';

	/*
	 * Step over the closing quote; left in ts->look it would be
	 * taken by tokenize() as the start of another string.
	 */
	if (ts->look == '"')
		getch(ts);

	token->type = STRING;
	token->val.string = buf;
}

/**
 * Loops through the file ts->fh and tokenizes it, storing the tokens
 * in the blocks managed by getnexttoken().
 *
 * It works as follows:
 * Step 1: Get character
 * Step 2: Skip whitespace other than newlines
 * Step 3: Check if the character starts a number, a
 *         keyword/variable, a string, a newline or an operator
 * Step 4: Do the appropriate action for parsing
 * Step 5: Repeat until end of input
 *
 * Every token records the filename, line, column and position at
 * which it starts.
 *
 * Returns 0 once the whole input has been tokenized, or -1 if ts is
 * NULL or has no stream to read from.  Lexical errors and allocation
 * failures do not return: the helpers print a message and exit with
 * EXIT_FAILURE.
 */
int
tokenize(struct tokenstate *ts)
{
	struct token *token;

	/* getc() on a null stream is undefined, so reject it up front */
	if (ts == NULL || ts->fh == NULL)
		return -1;

	getch(ts);

	for (;;) {
		skip_whitespace(ts);
		/* getch() stores '\0' in look at end of input */
		if (ts->look == '\0')
			break;

		/**
		 * Get a pointer to the next struct token and set
		 * information used across all cases.
		 */
		token = getnexttoken(ts);
		token->filename = ts->filename;
		token->col = ts->col;
		token->line = ts->line;
		token->pos = ts->pos;

		/* casts: <ctype.h> is only defined for unsigned char values */
		if (isdigit((unsigned char)ts->look))
			tokenizenum(token, ts);
		else if (isalpha((unsigned char)ts->look))
			tokenizevarkw(token, ts);
		else if (ts->look == '"')
			tokenizestring(token, ts);
		else if (ts->look == '\n') {
			token->type = NEWLINE;
			getch(ts);
		}
		else {
			/* something else, probably an operator */
			/* don't bother to check here, it's not worth it */
			token->type = OP;
			token->val.op = ts->look;
			getch(ts);
		}
	}

	return 0;
}