#include "parse.h"

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Tokenizer for a Lisp-like syntax. A TokenStream wraps a FILE and tracks the
// current line/column plus a queue of parse errors; token_stream_next() fills
// a caller-owned Token with the next lexeme.
//
// NOTE(review): column tracking is incomplete (e.g. next_symbol() and the ','
// branch of token_stream_next() never advance stream->col), so columns
// reported for later tokens on a line can lag behind. The bookkeeping is kept
// exactly as it was; confirm what callers expect before changing it.

// Capacity given to a token buffer on its first allocation.
enum { TOKEN_MIN_CAPACITY = 16 };

// Allocation policy for this file: running out of memory is fatal.
// Behaves like realloc(ptr, size) (malloc when ptr is NULL) but never
// returns NULL; it reports the failure on stderr and aborts instead.
static void *checked_realloc(void *ptr, size_t size) {
    void *mem = realloc(ptr, size);
    if (!mem) {
        fputs("parse: out of memory\n", stderr);
        abort();
    }
    return mem;
}

// Returns a static, human-readable name for a token type.
// Values that are not a declared enumerator yield "UNKNOWN".
const char *token_type_to_str(TokenType type) {
    switch (type) {
    case TOKEN_TYPE_EOF:
        return "EOF";
    case TOKEN_TYPE_COMMENT:
        return "COMMENT";
    case TOKEN_TYPE_PAREN:
        return "PAREN";
    case TOKEN_TYPE_BRACKET:
        return "BRACKET";
    case TOKEN_TYPE_SYMBOL:
        return "SYMBOL";
    case TOKEN_TYPE_PROPERTY:
        return "PROPERTY";
    case TOKEN_TYPE_QUOTE:
        return "QUOTE";
    case TOKEN_TYPE_NUMBER:
        return "NUMBER";
    case TOKEN_TYPE_CHAR:
        return "CHAR";
    case TOKEN_TYPE_STRING:
        return "STRING";
    case TOKEN_TYPE_COMMA:
        return "COMMA";
    case TOKEN_TYPE_BACKQUOTE:
        return "BACKQUOTE";
    case TOKEN_TYPE_SPLICE:
        return "SPLICE";
    case TOKEN_TYPE_UNKNOWN:
        return "UNKNOWN";
    }
    // No `default:` above so the compiler can still warn about enumerators
    // missing from the switch; this return covers out-of-range values.
    return "UNKNOWN";
}

// Makes sure token->text can hold at least `needed` bytes.
// Capacity doubles so that appending n characters costs O(n) overall. If the
// doubling wraps around, the result is smaller than `needed` and the exact
// size is used instead.
static void token_reserve(Token *token, size_t needed) {
    if (needed <= token->buf_len) {
        return;
    }
    size_t capacity = token->buf_len ? token->buf_len * 2 : TOKEN_MIN_CAPACITY;
    if (capacity < needed) {
        capacity = needed;
    }
    token->text = checked_realloc(token->text, capacity);
    token->buf_len = capacity;
}

// Appends one character to the token text (no terminator is written).
static void append_char(Token *token, char new_char) {
    token_reserve(token, token->len + 1);
    token->text[token->len++] = new_char;
}

// Writes a '\0' just past the current text without changing token->len.
static void append_null_byte(Token *token) {
    token_reserve(token, token->len + 1);
    token->text[token->len] = '\0';
}

// Replaces the token text with the first src_len bytes of src and
// NUL-terminates it.
static void copy_to_buffer(Token *token, const char *src, size_t src_len) {
    token_reserve(token, src_len + 1);
    if (src_len > 0) {
        memcpy(token->text, src, src_len);
    }
    token->len = src_len;
    token->text[src_len] = '\0';
}

// Non-zero when c terminates a symbol: whitespace, a paren or bracket, or ','.
// c must be an unsigned char value or EOF (i.e. what fgetc returns).
static int issymbolend(int c) {
    return isspace(c) || c == ')' || c == ']' || c == '(' || c == '[' ||
           c == ',';
}

// Queues a parse error located at `token` and resets the token to an empty
// TOKEN_TYPE_UNKNOWN.
//
// Takes ownership of `desc` (it is released by parse_error_free()). The
// current token text is copied into the error's `context`.
static void token_stream_push_error(TokenStream *stream, Token *token,
                                    char *desc, bool at_end) {
    ParseError *err = checked_realloc(NULL, sizeof *err);
    err->next = NULL;
    err->col = token->col;
    err->line = token->line;
    err->desc = desc;
    err->at_end = at_end;
    err->context = checked_realloc(NULL, token->len + 1);
    if (token->len > 0) {
        // guarded: token->text may still be NULL for an empty token
        memcpy(err->context, token->text, token->len);
    }
    err->context[token->len] = '\0';

    if (stream->error_tail) {
        stream->error_tail->next = err;
    } else {
        stream->error_head = err;
    }
    stream->error_tail = err;
    ++stream->error_count;

    token->type = TOKEN_TYPE_UNKNOWN;
    copy_to_buffer(token, "", 0);
}

// Creates a token stream reading from `src`, positioned at line 1, column 0,
// with an empty error queue. The stream takes ownership of `src`;
// destroy_token_stream() closes it.
TokenStream *make_token_stream(FILE *src) {
    TokenStream *stream = checked_realloc(NULL, sizeof *stream);
    stream->src = src;
    stream->col = 0;
    stream->line = 1;
    stream->error_tail = NULL;
    stream->error_head = NULL;
    stream->error_count = 0;
    return stream;
}

// Releases the stream: frees every error still queued (including its
// description and context strings), closes the source file and frees the
// stream itself. Passing NULL is a no-op.
void destroy_token_stream(TokenStream *stream) {
    if (!stream) {
        return;
    }
    ParseError *err = stream->error_head;
    while (err) {
        ParseError *next = err->next;
        free(err->desc);
        free(err->context);
        free(err);
        err = next;
    }
    if (stream->src) {
        fclose(stream->src);
    }
    free(stream);
}

// Reads the rest of the current line (up to, not including, '\n' or EOF) into
// the token, then moves the stream position to the start of the next line.
static void next_comment(TokenStream *stream, Token *token) {
    int c;
    while ((c = fgetc(stream->src)) != EOF) {
        if (c == '\n') {
            break;
        }
        append_char(token, (char)c);
    }
    append_null_byte(token);
    ++stream->line;
    stream->col = 0;
}

// Consumes characters while pred(c) holds (or, when `inv` is true, while it
// does not hold), updating line/column. The first character that stops the
// scan is pushed back.
static void skip_while(TokenStream *stream, int (*pred)(int c), bool inv) {
    int c;
    while ((c = fgetc(stream->src)) != EOF && ((pred(c) != 0) != inv)) {
        if (c == '\n') {
            ++stream->line;
            stream->col = 0;
        } else {
            ++stream->col;
        }
    }
    ungetc(c, stream->src); // no-op when c is EOF
}

// Reads exactly one character as the whole token. Callers have already
// peeked it, so it is known not to be EOF.
static void next_char(TokenStream *stream, Token *token) {
    char c = (char)fgetc(stream->src);
    copy_to_buffer(token, &c, 1);
    ++stream->col;
}

// Reads a double-quoted string, keeping both quotes in the token text.
// A backslash followed by '"' stores just the quote; any other escaped
// character is stored together with its backslash. Hitting EOF before the
// closing quote queues an "expected '\"'" error.
static void next_string(TokenStream *stream, Token *token) {
    bool backslash = false;
    int c = fgetc(stream->src); // opening "
    append_char(token, (char)c);
    while ((c = fgetc(stream->src)) != EOF && (backslash || c != '"')) {
        if (c == '\\' && !backslash) {
            backslash = true;
            continue;
        }
        if (backslash && c != '"') {
            append_char(token, '\\');
        }
        backslash = false;
        append_char(token, (char)c);
        if (c == '\n') {
            ++stream->line;
            stream->col = 0;
        } else {
            ++stream->col;
        }
    }
    ++stream->col;
    if (c == EOF) {
        // The loop only ends on EOF or on the closing quote, so test the
        // character itself; feof() would miss a read error.
        token_stream_push_error(stream, token,
                                strdup("expected '\"', got EOF"), true);
        append_null_byte(token);
    } else {
        append_char(token, '"');
        append_null_byte(token);
    }
}

// Appends characters to the token until a symbol terminator or EOF. A
// backslash is dropped and makes the following character literal, even if it
// is a terminator. The terminating character is pushed back.
static void next_symbol(TokenStream *stream, Token *token) {
    int c;
    bool backslash = false;
    while ((c = fgetc(stream->src)) != EOF && (backslash || !issymbolend(c))) {
        if (c == '\\' && !backslash) {
            backslash = true;
        } else {
            backslash = false;
            append_char(token, (char)c);
        }
    }
    append_null_byte(token);
    ungetc(c, stream->src); // no-op when c is EOF
}

// Reads a character literal: '#' followed by one character, or '#\' followed
// by a run of letters, digits and backslashes (a named literal such as "#\n").
// Queues an error if EOF follows the '#', or if the literal is not followed
// by a symbol terminator (in which case the excess characters are skipped).
static void next_char_literal(TokenStream *stream, Token *token) {
    fgetc(stream->src); // the '#' the caller peeked
    append_char(token, '#');
    int c = fgetc(stream->src);
    if (c != EOF) {
        append_char(token, (char)c);
    }
    stream->col += 2;

    if (c == EOF) {
        // push_error resets the token to an empty UNKNOWN token
        token_stream_push_error(
            stream, token, strdup("expected character literal, got EOF"), true);
    } else if (c == '\\') {
        // named character literal, like "#\n"
        while ((c = fgetc(stream->src)) != EOF &&
               (isalpha(c) || isdigit(c) || c == '\\')) {
            append_char(token, (char)c);
            ++stream->col;
        }
    } else {
        c = fgetc(stream->src);
    }
    append_null_byte(token);

    // c now holds the first character after the literal (or EOF)
    if (c != EOF && !issymbolend(c)) {
        token_stream_push_error(stream, token,
                                strdup("character literal too long"), false);
        skip_while(stream, &issymbolend, true);
    } else {
        ungetc(c, stream->src);
    }
}

// Non-zero when c is a valid digit in `base`. We accept base = 2, 8, 10
// and 16; any other base accepts nothing.
static int isbasechar(char c, int base) {
    if (c < '0') {
        return false;
    }
    switch (base) {
    case 16:
        c = (char)tolower(c);
        if (c >= 'a' && c <= 'f') {
            return true;
        }
        base = 10; // the remaining hex digits are the decimal ones
        /* fallthrough */
    case 2:
    case 8:
    case 10:
        return c <= '0' + (base - 1);
    default:
        return false;
    }
}

// Interprets the token text read so far as a base prefix (the part before
// '#'). Leading zeros are ignored. Returns 2, 8, 10 or 16, or 0 when the text
// is not one of those.
static int get_base_from_token(Token *token) {
    size_t i;
    for (i = 0; i < token->len; ++i) {
        if (token->text[i] != '0') {
            break;
        }
    }
    size_t digits = token->len - i;
    if (digits == 1) {
        switch (token->text[i]) {
        case '2':
            return 2;
        case '8':
            return 8;
        default:
            return 0;
        }
    }
    if (digits == 2 && token->text[i] == '1') {
        switch (token->text[i + 1]) {
        case '0':
            return 10;
        case '6':
            return 16;
        default:
            return 0;
        }
    }
    return 0;
}

// Gives up on reading a number: pushes `c` back and reads the remainder of
// the lexeme as a symbol, appending to what the token already holds.
static void finish_as_symbol(TokenStream *stream, Token *token, int c) {
    token->type = TOKEN_TYPE_SYMBOL;
    ungetc(c, stream->src);
    next_symbol(stream, token);
}

// Reads a lexeme that starts like a number (digit, sign or '.'), where
// `first_char` has already been consumed. The token ends up as
// TOKEN_TYPE_NUMBER if the whole lexeme fits the number syntax handled below
// (optional base prefix "<base>#", one decimal point, one 'e' exponent in
// base 10, a sign right after '#' or 'e'); otherwise it is a
// TOKEN_TYPE_SYMBOL.
static void next_number_or_symbol(TokenStream *stream, Token *token,
                                  char first_char) {
    token->type = TOKEN_TYPE_NUMBER;
    bool has_decimal = false;
    bool has_base = false;
    bool only_nums = true; // only digits so far, i.e. still a valid base prefix
    bool has_exp = false;
    bool allow_plus_minus = false;
    int base = 10;
    bool has_num = isbasechar(first_char, 10);

    if (first_char == '.') {
        has_decimal = true;
        only_nums = false;
    } else if (first_char == '-' || first_char == '+') {
        only_nums = false;
    }
    append_char(token, first_char);

    int c;
    while ((c = fgetc(stream->src)) != EOF && !issymbolend(c)) {
        ++stream->col;
        if (c == '.') {
            if (has_decimal || has_base) {
                finish_as_symbol(stream, token, c);
                return;
            }
            only_nums = false;
            has_decimal = true;
        } else if (c == '#') {
            if (has_base || !only_nums) {
                finish_as_symbol(stream, token, c);
                return;
            }
            base = get_base_from_token(token);
            if (base == 0) {
                finish_as_symbol(stream, token, c);
                return;
            }
            has_base = true;
            has_num = false; // digits must follow the prefix
            allow_plus_minus = true;
            append_char(token, (char)c);
            continue;
        } else if (base == 10 && c == 'e') {
            if (has_exp) {
                finish_as_symbol(stream, token, c);
                return;
            }
            allow_plus_minus = true;
            only_nums = false;
            has_exp = true;
            has_decimal = false; // the exponent can have a decimal point
            append_char(token, (char)c);
            continue;
        } else if (allow_plus_minus && (c == '+' || c == '-')) {
            // sign directly after '#' or 'e': accepted, handled below
        } else if (!isbasechar((char)c, base)) {
            finish_as_symbol(stream, token, c);
            return;
        }
        has_num = true;
        allow_plus_minus = false;
        append_char(token, (char)c);
    }

    if (!has_num) {
        token->type = TOKEN_TYPE_SYMBOL;
    }
    append_null_byte(token);
    ungetc(c, stream->src); // no-op when c is EOF
}

// Reads the next token from the stream into `token` (type, text, len, line
// and col are overwritten; the text buffer is reused and grown as needed).
// At end of input the token is an empty TOKEN_TYPE_EOF.
// Returns the number of errors currently queued on the stream.
size_t token_stream_next(TokenStream *stream, Token *token) {
    skip_while(stream, &isspace, false);
    token->line = stream->line;
    token->col = stream->col + 1;
    token->len = 0;

    int nc = fgetc(stream->src);
    if (nc == EOF) {
        token->type = TOKEN_TYPE_EOF;
        copy_to_buffer(token, "", 0);
        return stream->error_count;
    }
    ungetc(nc, stream->src); // the readers below consume it themselves

    if (nc == ';') {
        token->type = TOKEN_TYPE_COMMENT;
        next_comment(stream, token);
    } else if (nc == '(' || nc == ')') {
        token->type = TOKEN_TYPE_PAREN;
        next_char(stream, token);
    } else if (nc == '[' || nc == ']') {
        token->type = TOKEN_TYPE_BRACKET;
        next_char(stream, token);
    } else if (nc == '\'') {
        token->type = TOKEN_TYPE_QUOTE;
        next_char(stream, token);
    } else if (nc == '`') {
        token->type = TOKEN_TYPE_BACKQUOTE;
        next_char(stream, token);
    } else if (nc == ',') {
        // look at character after the ,
        fgetc(stream->src);
        int after = fgetc(stream->src);
        if (after == '@') {
            token->type = TOKEN_TYPE_SPLICE;
            copy_to_buffer(token, ",@", 2);
        } else {
            ungetc(after, stream->src);
            token->type = TOKEN_TYPE_COMMA;
            copy_to_buffer(token, ",", 1);
        }
    } else if (nc == '"') {
        token->type = TOKEN_TYPE_STRING;
        next_string(stream, token);
    } else if (nc == '.') {
        // look at character after the .
        fgetc(stream->src);
        int after = fgetc(stream->src);
        ungetc(after, stream->src);
        if (isspace(after)) {
            ++stream->col;
            token->type = TOKEN_TYPE_SYMBOL;
            copy_to_buffer(token, ".", 1);
        } else {
            // the . is part of something bigger
            next_number_or_symbol(stream, token, '.');
        }
    } else if (nc == '#') {
        token->type = TOKEN_TYPE_CHAR;
        next_char_literal(stream, token);
    } else if (nc == '-' || nc == '+' || (nc >= '0' && nc <= '9')) {
        next_number_or_symbol(stream, token, (char)fgetc(stream->src));
    } else {
        token->type = nc == ':' ? TOKEN_TYPE_PROPERTY : TOKEN_TYPE_SYMBOL;
        next_symbol(stream, token);
    }
    return stream->error_count;
}

// Frees the token's text buffer and resets the buffer fields, so the token
// can safely be passed to token_stream_next() again. The Token struct itself
// is not freed.
void token_free(Token *token) {
    free(token->text);
    token->text = NULL;
    token->len = 0;
    token->buf_len = 0;
}

// Removes and returns the oldest queued error, or NULL when none is queued.
// The caller owns the result and releases it with parse_error_free().
ParseError *token_stream_error(TokenStream *stream) {
    ParseError *error = stream->error_head;
    if (stream->error_count == 0 || !error) {
        return NULL;
    }
    stream->error_head = error->next;
    if (!stream->error_head) {
        stream->error_tail = NULL;
    }
    error->next = NULL;
    --stream->error_count;
    return error;
}

// Frees an error returned by token_stream_error(), including its description
// and context strings. NULL is accepted.
void parse_error_free(ParseError *error) {
    if (!error) {
        return;
    }
    free(error->desc);
    free(error->context);
    free(error);
}

// True once the end-of-file indicator is set on the underlying file.
bool token_stream_is_eof(TokenStream *stream) {
    return feof(stream->src) != 0;
}