/** * Plans: * - maybe change exit(EXIT_FAILURE) to some kind of woody_exit? */ #include #include #include #include #include #include "token.h" static const char *keywords[] = { "if", "then", "elsif", "else", "while" }; /** * Gets the next character in the file, and increments * the position/column/line in the struct tokenstate. */ static inline void getch(struct tokenstate *ts) { int look = getc(ts->fh); if (look == EOF) ts->look = '\0'; else { if (look == '\n') { ts->col = 0; ts->line++; } ts->look = look; ts->col++; ts->pos++; } } /** * Skips whitespace in the input. */ static inline void skip_whitespace(struct tokenstate *ts) { while (isspace(ts->look) && ts->look != '\n') getch(ts); } /** * Returns a pointer to where the next token should be put. * In struct tokenstate, the field tokens is an array of * pointers to ``blocks'' of 1024 struct tokens. This function * checks whether tokenind is over the limit, and if so, * it just returns tokens[tokenblk][tokenind++]. Otherwise, it * allocates a new block. * * Note that I made this method up, it might have some other * name already. If so, please contact me so I can change the * comments. */ static inline struct token * getnexttoken(struct tokenstate *ts) { /* enough space in this block */ if (ts->tokenind < 1024) return &ts->tokens[ts->tokenblk][ts->tokenind++]; /* not enough space in this block */ else { if (++ts->tokenblk < ts->tokensz) { alloc_new_block: /* allocate a new block */ ts->tokens[ts->tokenblk] = calloc(1024, sizeof(struct token)); if (ts->tokens[ts->tokenblk] == NULL) { perror("woody lexer"); exit(EXIT_FAILURE); } return &ts->tokens[ts->tokenblk][ts->tokenind++]; } /* all pointers in array exhausted, must reallocate */ else { ts->tokensz++; /* need larger tokensz */ ts->tokenblk++; /* go to next (new) block */ ts->tokens = realloc(ts->tokens, ts->tokensz * sizeof(struct token)); if (ts->tokens == NULL) { perror("woody lexer"); exit(EXIT_FAILURE); } ts->tokenind = 0; goto alloc_new_block; } } /* NOTREACHED */ } /** * Used to get a character like getch but in the context * of a string. Thus also accepting escape sequences. * \n newline * \r carriage return */ static inline int getcharfromstr(struct tokenstate *ts) { if (ts->look == '\\') { getch(ts); switch (ts->look) { case 'n': getch(ts); return '\n'; case 'r': getch(ts); return '\r'; default: fprintf(stderr, "%s:%i:%i: " "invalid escape sequence\n", ts->filename, ts->line, ts->col); exit(EXIT_FAILURE); /* NOTREACHED */ return -1; } } else { int look = ts->look; getch(ts); return look; } } static inline void tokenizenum(struct token *token, struct tokenstate *ts) { int64_t num = ts->look - '0'; getch(ts); while (isdigit(ts->look)) { num *= 10; num += ts->look - '0'; getch(ts); } token->type = NUM; token->val.num = num; } /* tokenize a variable or keyword */ static inline void tokenizevarkw(struct token *token, struct tokenstate *ts) { /* NOTE: maximum var size is 32, maybe change? */ int i; char *buf = malloc(32); if (buf == NULL) { perror("woody lexer"); exit(EXIT_FAILURE); } /* read the var/kw */ for (i = 0; i < 32 && (isalpha(ts->look) || isdigit(ts->look)); i++) { buf[i] = ts->look; getch(ts); } buf[i] = '\0'; for (i = 0; i < ENDKWTYPE; i++) { if (strcmp(buf, keywords[i]) == 0) { /* keyword */ token->type = KW; token->val.kw = i; } } /* variable */ if (i == ENDKWTYPE) { token->type = VAR; token->val.var = buf; } } static inline void tokenizestring(struct token *token, struct tokenstate *ts) { int i = 0; size_t bufsz = 512; char *buf; getch(ts); if (ts->look == '"') { /* empty string is illegal */ fprintf(stderr, "%s:%i:%i: empty string" " is illegal\n", ts->filename, ts->line, ts->col); exit(EXIT_FAILURE); } buf = malloc(bufsz); if (buf == NULL) { perror("woody lexer"); exit(EXIT_FAILURE); } read_string: for (; i < bufsz && ts->look != '"' && ts->look != '\0'; i++) buf[i] = getcharfromstr(ts); if (ts->look != '"' && ts->look != '\0') { /* buf too small */ buf = realloc(buf, bufsz + 127); if (buf == NULL) { perror("woody lexer"); exit(EXIT_FAILURE); } goto read_string; } buf[i] = '\0'; token->type = STRING; token->val.string = buf; } /** * This function loops through the given file and tokenizes it. * It works as follows: * Step 1: Get character * Step 2: Check if character is a string, number, operator * or a keyword/variable * Step 3: Do the appropriate action for parsing * Step 4: Repeat until EOF */ int tokenize(struct tokenstate *ts) { struct token *token; if (ts == NULL) return -1; getch(ts); while (ts->look != 0) { skip_whitespace(ts); if (ts->look == 0) break; /** * Get a pointer to the next struct token and set * information used across all cases. */ token = getnexttoken(ts); token->filename = ts->filename; token->col = ts->col; token->line = ts->line; token->pos = ts->pos; if (isdigit(ts->look)) tokenizenum(token, ts); else if (isalpha(ts->look)) tokenizevarkw(token, ts); else if (ts->look == '"') tokenizestring(token, ts); else if (ts->look == '\n') { token->type = NEWLINE; getch(ts); } else { /* something else, probably an operator */ /* don't bother to check here, it's not worth it */ token->type = OP; token->val.op = ts->look; getch(ts); } } return 0; }