463 lines
13 KiB
C
463 lines
13 KiB
C
#include "parse.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// Returns a human-readable name for a token type.
//
// The result is a string literal: it must not be modified or freed.
// A value that matches none of the enumerators handled below yields
// "UNKNOWN" rather than falling off the end of a non-void function, which
// would be undefined behavior as soon as the caller used the result.
const char *token_type_to_str(TokenType type) {
    switch (type) {
    case TOKEN_TYPE_EOF:
        return "EOF";
    case TOKEN_TYPE_COMMENT:
        return "COMMENT";
    case TOKEN_TYPE_PAREN:
        return "PAREN";
    case TOKEN_TYPE_BRACKET:
        return "BRACKET";
    case TOKEN_TYPE_SYMBOL:
        return "SYMBOL";
    case TOKEN_TYPE_PROPERTY:
        return "PROPERTY";
    case TOKEN_TYPE_QUOTE:
        return "QUOTE";
    case TOKEN_TYPE_NUMBER:
        return "NUMBER";
    case TOKEN_TYPE_CHAR:
        return "CHAR";
    case TOKEN_TYPE_STRING:
        return "STRING";
    case TOKEN_TYPE_COMMA:
        return "COMMA";
    case TOKEN_TYPE_BACKQUOTE:
        return "BACKQUOTE";
    case TOKEN_TYPE_SPLICE:
        return "SPLICE";
    case TOKEN_TYPE_UNKNOWN:
        return "UNKNOWN";
    }
    // Deliberately no `default:` label above, so the compiler can still warn
    // when a newly added enumerator is missing from the switch.
    return "UNKNOWN";
}
|
|
|
|
// Appends new_char to the end of token->text and advances token->len.
//
// No terminating '\0' is written here. When the buffer is full it is grown
// geometrically, so building an n-character token costs O(n) amortized
// instead of one realloc per character.
//
// Out-of-memory policy: this function has no way to report failure, so it
// aborts instead of writing through a NULL pointer.
static void append_char(Token *token, char new_char) {
    if (token->len >= token->buf_len) {
        size_t new_cap = token->buf_len ? token->buf_len * 2 : 16;
        if (new_cap <= token->len) {
            // Covers both a len that is ahead of the doubled capacity and
            // wrap-around of the multiplication.
            new_cap = token->len + 1;
        }
        // Keep the old pointer until realloc succeeds; on failure the
        // original block is still valid.
        void *grown = realloc(token->text, new_cap);
        if (!grown) {
            abort();
        }
        token->text = grown;
        token->buf_len = new_cap;
    }
    token->text[token->len++] = new_char;
}
|
|
|
|
// Writes a terminating '\0' at token->text[token->len].
//
// token->len is NOT advanced, so the terminator is not counted as part of
// the token and is overwritten by the next appended character. The buffer
// is grown by exactly the one byte needed when it is full.
//
// Out-of-memory policy: this function has no way to report failure, so it
// aborts instead of writing through a NULL pointer.
static void append_null_byte(Token *token) {
    if (token->len >= token->buf_len) {
        size_t new_cap = token->len + 1;
        // Keep the old pointer until realloc succeeds.
        void *grown = realloc(token->text, new_cap);
        if (!grown) {
            abort();
        }
        token->text = grown;
        token->buf_len = new_cap;
    }
    token->text[token->len] = '\0';
}
|
|
|
|
// Replaces the token's text with the first src_len bytes of src.
//
// After the call token->len == src_len and token->text is NUL-terminated.
// The buffer is enlarged to src_len + 1 bytes when it cannot already hold
// the data plus the terminator. src must not point into token->text
// (memcpy is used).
//
// Out-of-memory policy: this function has no way to report failure, so it
// aborts instead of writing through a NULL pointer.
static void copy_to_buffer(Token *token, const char *src, size_t src_len) {
    if (src_len >= token->buf_len) {
        size_t new_cap = src_len + 1;
        // Keep the old pointer until realloc succeeds.
        void *grown = realloc(token->text, new_cap);
        if (!grown) {
            abort();
        }
        token->text = grown;
        token->buf_len = new_cap;
    }
    if (src_len > 0) {
        memcpy(token->text, src, src_len);
    }
    token->len = src_len;
    token->text[src_len] = '\0';
}
|
|
|
|
// Returns non-zero when c terminates a symbol: any whitespace character,
// one of the delimiters ( ) [ ] or a comma.
//
// Negative inputs (EOF, or a byte >= 0x80 that reached here through a
// plain, signed `char`) are never terminators. They are rejected before
// isspace() is called, because isspace() is only defined for EOF and for
// values representable as unsigned char.
static int issymbolend(int c) {
    if (c < 0) {
        return 0;
    }
    switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case ',':
        return 1;
    default:
        return isspace(c) != 0;
    }
}
|
|
|
|
// takes the string arguments
|
|
static void token_stream_push_error(TokenStream *stream, Token *token,
|
|
char *desc, bool at_end) {
|
|
ParseError *err = malloc(sizeof(ParseError));
|
|
err->next = NULL;
|
|
err->col = token->col;
|
|
err->line = token->line;
|
|
err->desc = desc;
|
|
err->context = malloc(token->len + 1);
|
|
err->at_end = at_end;
|
|
memcpy(err->context, token->text, token->len);
|
|
err->context[token->len] = '\0';
|
|
if (stream->error_tail) {
|
|
stream->error_tail->next = err;
|
|
} else {
|
|
stream->error_head = err;
|
|
}
|
|
stream->error_tail = err;
|
|
++stream->error_count;
|
|
token->type = TOKEN_TYPE_UNKNOWN;
|
|
copy_to_buffer(token, "", 0);
|
|
}
|
|
|
|
// src is taken by this function
|
|
TokenStream *make_token_stream(FILE *src) {
|
|
TokenStream *stream = malloc(sizeof(TokenStream));
|
|
stream->src = src;
|
|
stream->col = 0;
|
|
stream->line = 1;
|
|
stream->error_tail = NULL;
|
|
stream->error_head = NULL;
|
|
stream->error_count = 0;
|
|
return stream;
|
|
}
|
|
|
|
// Destroys a token stream: frees every error still queued on it, closes
// the underlying FILE, and frees the stream itself.
//
// Each queued error owns its `desc` and `context` strings, so those are
// released together with the node; freeing only the node would leak them.
// Passing NULL is a no-op.
void destroy_token_stream(TokenStream *stream) {
    if (!stream) {
        return;
    }
    ParseError *err = stream->error_head;
    while (err) {
        ParseError *next = err->next;
        free(err->desc);
        free(err->context);
        free(err);
        err = next;
    }
    if (stream->src) {
        fclose(stream->src);
    }
    free(stream);
}
|
|
|
|
// Reads a comment token: everything up to, but not including, the next
// newline or end of input is appended to the token text. The caller leaves
// the leading ';' unread, so it is part of the text. The newline itself is
// consumed.
//
// Afterwards the stream is positioned at the start of the next line
// (line incremented, column reset to 0).
static void next_comment(TokenStream *stream, Token *token) {
    // fgetc() returns int; keeping it in a char would either confuse byte
    // 0xFF with EOF (signed char) or never see EOF at all (unsigned char).
    int c;
    while ((c = fgetc(stream->src)) != EOF && c != '\n') {
        append_char(token, (char)c);
    }
    append_null_byte(token);
    ++stream->line;
    stream->col = 0;
}
|
|
|
|
// Consumes characters from the stream while a predicate holds.
//
//   pred  ctype-style predicate (non-zero means "matches")
//   inv   false: skip while pred(c) is true
//         true:  skip while pred(c) is false
//
// Line and column counters are updated for every skipped character. The
// first character that stops the scan is pushed back so the next read sees
// it again; nothing is pushed back at end of input.
static void skip_while(TokenStream *stream, int(*pred)(int c), bool inv) {
    // Must be int: fgetc() yields an unsigned-char value or EOF, which is
    // exactly the domain ctype predicates such as isspace() accept.
    int c;
    while ((c = fgetc(stream->src)) != EOF) {
        bool matched = pred(c) != 0;
        if (matched == inv) {
            break;
        }
        if (c == '\n') {
            ++stream->line;
            stream->col = 0;
        } else {
            ++stream->col;
        }
    }
    if (c != EOF) {
        ungetc(c, stream->src);
    }
}
|
|
|
|
// Reads exactly one character from the stream and makes it the entire text
// of the token, then advances the column by one.
static void next_char(TokenStream *stream, Token *token) {
    char single[1];

    token->len = 1;
    single[0] = fgetc(stream->src);
    copy_to_buffer(token, single, sizeof single);
    stream->col += 1;
}
|
|
|
|
// Reads a string literal; the stream must be positioned on the opening '"'.
//
// The token text keeps both surrounding quotes. Escapes are handled as:
//   \"        stored as a bare "  (the backslash is dropped)
//   \<other>  stored unchanged, backslash included
// Embedded newlines are allowed and advance the line counter.
//
// If input ends before the closing quote, an error is pushed (which resets
// the token to TOKEN_TYPE_UNKNOWN with empty text).
static void next_string(TokenStream *stream, Token *token) {
    bool backslash = false;
    // int, not char: otherwise byte 0xFF inside a string is mistaken for
    // EOF on signed-char platforms and EOF is never seen on unsigned ones.
    int c = fgetc(stream->src); // opening "
    append_char(token, (char)c);
    while ((c = fgetc(stream->src)) != EOF && (backslash || c != '"')) {
        if (c == '\\' && !backslash) {
            // Remember the escape; what is stored depends on the next char.
            backslash = true;
            continue;
        }
        if (backslash && c != '"') {
            append_char(token, '\\');
        }
        backslash = false;
        append_char(token, (char)c);
        if (c == '\n') {
            ++stream->line;
            stream->col = 0;
        } else {
            ++stream->col;
        }
    }
    ++stream->col;
    // Decide from the read result itself rather than feof(): the loop also
    // stops with EOF on a read error, and that is not a terminated string.
    if (c == EOF) {
        token_stream_push_error(stream, token, strdup("expected '\"', got EOF"),
                                true);
    } else {
        append_char(token, '"');
    }
    append_null_byte(token);
}
|
|
|
|
// Appends characters to the token until a symbol terminator (see
// issymbolend) or end of input is reached.
//
// A backslash escapes the next character: the backslash is dropped and the
// following character is appended verbatim, even if it would otherwise end
// the symbol. The terminating character is pushed back for the next token.
//
// NOTE(review): stream->col is not advanced here, unlike in the other
// readers — confirm whether that is intentional.
static void next_symbol(TokenStream *stream, Token *token) {
    // int, not char, so that EOF is distinguishable from every byte value.
    int c;
    bool backslash = false;
    while ((c = fgetc(stream->src)) != EOF &&
           (backslash || !issymbolend(c))) {
        if (c == '\\' && !backslash) {
            backslash = true;
        } else {
            backslash = false;
            append_char(token, (char)c);
        }
    }
    append_null_byte(token);
    if (c != EOF) {
        ungetc(c, stream->src);
    }
}
|
|
|
|
// Reads a character literal; the stream must be positioned on the '#'.
//
// Two forms are recognized:
//   #x        '#' followed by a single character
//   #\name    '#', a backslash, then a run of letters, digits or
//             backslashes (a named character such as #\n)
//
// Errors (each resets the token to TOKEN_TYPE_UNKNOWN with empty text):
//   - input ends right after '#'
//   - the literal is not followed by a symbol terminator; the rest of the
//     offending run is skipped up to the next terminator
static void next_char_literal(TokenStream *stream, Token *token) {
    fgetc(stream->src); // consume the '#'
    append_char(token, '#');
    // int, not char, so EOF is distinguishable from every byte value.
    int c = fgetc(stream->src);
    stream->col += 2;
    if (c == EOF) {
        // Check before appending so the EOF marker never ends up in the
        // token text (and therefore in the error context).
        token_stream_push_error(stream, token,
                                strdup("expected character literal, got EOF"),
                                true);
    } else {
        append_char(token, (char)c);
        if (c == '\\') {
            // named character literal, like "#\n"
            while ((c = fgetc(stream->src)) != EOF &&
                   (isalnum(c) || c == '\\')) {
                append_char(token, (char)c);
                ++stream->col;
            }
        } else {
            c = fgetc(stream->src);
        }
    }
    append_null_byte(token);
    // Here c is the first character after the literal (or EOF); every
    // branch above has already read it.
    if (c != EOF && !issymbolend(c)) {
        token_stream_push_error(stream, token,
                                strdup("character literal too long"),
                                false);
        skip_while(stream, &issymbolend, true);
    } else if (c != EOF) {
        ungetc(c, stream->src);
    }
}
|
|
|
|
// Tells whether c is a valid digit in the given base.
// Only bases 2, 8, 10 and 16 are supported; any other base yields false.
// Hexadecimal letters are accepted in either case.
static int isbasechar(char c, int base) {
    if (c < '0') {
        return false;
    }
    if (base != 2 && base != 8 && base != 10 && base != 16) {
        return false;
    }

    // Number of decimal-style digits ('0'..) valid in this base.
    int decimal_digits = base;
    if (base == 16) {
        c = tolower(c);
        if ('a' <= c && c <= 'f') {
            return true;
        }
        // Not a hex letter: the remaining hex digits are plain 0-9.
        decimal_digits = 10;
    }
    return c <= '0' + (decimal_digits - 1);
}
|
|
|
|
// Interprets the token text read so far as a radix prefix.
//
// Leading '0' characters are ignored. The remainder must be exactly "2",
// "8", "10" or "16"; the matching base is returned. Anything else
// (including an empty remainder) returns 0.
static int get_base_from_token(Token *token) {
    size_t start = 0;
    while (start < token->len && token->text[start] == '0') {
        ++start;
    }

    const size_t remaining = token->len - start;
    if (remaining == 1) {
        if (token->text[start] == '2') {
            return 2;
        }
        if (token->text[start] == '8') {
            return 8;
        }
        return 0;
    }
    if (remaining == 2 && token->text[start] == '1') {
        if (token->text[start + 1] == '0') {
            return 10;
        }
        if (token->text[start + 1] == '6') {
            return 16;
        }
    }
    return 0;
}
|
|
|
|
// Gives up on reading a number: marks the token as a symbol, pushes the
// offending character back and lets next_symbol() read the remainder.
static void finish_as_symbol(TokenStream *stream, Token *token, int c) {
    token->type = TOKEN_TYPE_SYMBOL;
    ungetc(c, stream->src);
    next_symbol(stream, token);
}

// Reads a token that starts like a number and classifies it as
// TOKEN_TYPE_NUMBER or TOKEN_TYPE_SYMBOL.
//
// first_char has already been consumed by the caller; it is a digit, a
// sign or '.'. Characters are then read up to the next symbol terminator:
//   '.'    at most one per mantissa/exponent, and not after a radix prefix
//   '#'    allowed once, only if everything before it is digits naming
//          base 2, 8, 10 or 16 (see get_base_from_token); the digits that
//          follow are checked against that base
//   'e'    in base 10 only, starts an exponent (at most one)
//   + / -  accepted directly after a radix prefix or an 'e'
//   other  must be a digit of the current base
// Any violation turns the whole token into a symbol and hands the rest of
// it to next_symbol(). A token that ends without a digit-like character
// after its sign / prefix / exponent marker is also a symbol.
static void next_number_or_symbol(TokenStream *stream, Token *token, char first_char) {
    token->type = TOKEN_TYPE_NUMBER;
    bool has_decimal = false;      // a '.' was seen in the current part
    bool has_base = false;         // a radix prefix ("16#") was seen
    bool only_nums = true;         // only digits so far (needed for a prefix)
    bool has_exp = false;          // an exponent marker was seen
    bool allow_plus_minus = false; // next char may be a sign
    int base = 10;
    bool has_num = isbasechar(first_char, 10);
    if (first_char == '.') {
        has_decimal = true;
        only_nums = false;
    } else if (first_char == '-' || first_char == '+') {
        only_nums = false;
    }
    append_char(token, first_char);

    // int, not char, so EOF is distinguishable from every byte value and
    // issymbolend() never receives a negative character.
    int c;
    while ((c = fgetc(stream->src)) != EOF && !issymbolend(c)) {
        ++stream->col;
        if (c == '.') {
            if (has_decimal || has_base) {
                finish_as_symbol(stream, token, c);
                return;
            }
            only_nums = false;
            has_decimal = true;
        } else if (c == '#') {
            if (has_base || !only_nums) {
                finish_as_symbol(stream, token, c);
                return;
            }
            base = get_base_from_token(token);
            if (base == 0) {
                finish_as_symbol(stream, token, c);
                return;
            }
            has_base = true;
            has_num = false; // digits so far were the prefix, not the value
            allow_plus_minus = true;
            append_char(token, (char)c);
            continue;
        } else if (base == 10 && c == 'e') {
            if (has_exp) {
                finish_as_symbol(stream, token, c);
                return;
            }
            allow_plus_minus = true;
            only_nums = false;
            has_exp = true;
            has_decimal = false; // the exponent can have a decimal point
            append_char(token, (char)c);
            continue;
        } else if (allow_plus_minus && (c == '+' || c == '-')) {
            // sign accepted; handled by the common tail below
        } else if (!isbasechar((char)c, base)) {
            finish_as_symbol(stream, token, c);
            return;
        }
        has_num = true;
        allow_plus_minus = false;
        append_char(token, (char)c);
    }
    if (!has_num) {
        token->type = TOKEN_TYPE_SYMBOL;
    }
    append_null_byte(token);
    if (c != EOF) {
        ungetc(c, stream->src);
    }
}
|
|
|
|
// return the number of errors after parsing
|
|
size_t token_stream_next(TokenStream *stream, Token *token) {
|
|
skip_while(stream, &isspace, false);
|
|
token->line = stream->line;
|
|
token->col = stream->col + 1;
|
|
token->len = 0;
|
|
char nc = fgetc(stream->src);
|
|
if (nc == EOF) {
|
|
token->type = TOKEN_TYPE_EOF;
|
|
copy_to_buffer(token, "", 0);
|
|
return stream->error_count;
|
|
}
|
|
ungetc(nc, stream->src);
|
|
if (nc == ';') {
|
|
token->type = TOKEN_TYPE_COMMENT;
|
|
next_comment(stream, token);
|
|
} else if (nc == '(' || nc == ')') {
|
|
token->type = TOKEN_TYPE_PAREN;
|
|
next_char(stream, token);
|
|
} else if (nc == '[' || nc == ']') {
|
|
token->type = TOKEN_TYPE_BRACKET;
|
|
next_char(stream, token);
|
|
} else if (nc == '\'') {
|
|
token->type = TOKEN_TYPE_QUOTE;
|
|
next_char(stream, token);
|
|
} else if (nc == '`') {
|
|
token->type = TOKEN_TYPE_BACKQUOTE;
|
|
next_char(stream, token);
|
|
} else if (nc == ',') {
|
|
// look at character after the m
|
|
char chars[2];
|
|
chars[0] = fgetc(stream->src);
|
|
chars[1] = fgetc(stream->src);
|
|
if (chars[1] == '@') {
|
|
token->type = TOKEN_TYPE_SPLICE;
|
|
copy_to_buffer(token, chars, 2);
|
|
} else {
|
|
ungetc(chars[1], stream->src);
|
|
token->type = TOKEN_TYPE_COMMA;
|
|
copy_to_buffer(token, chars, 1);
|
|
}
|
|
} else if (nc == '"') {
|
|
token->type = TOKEN_TYPE_STRING;
|
|
next_string(stream, token);
|
|
} else if (nc == '.') {
|
|
// look at character after the .
|
|
char chars[2];
|
|
chars[0] = fgetc(stream->src);
|
|
chars[1] = fgetc(stream->src);
|
|
ungetc(chars[1], stream->src);
|
|
if (isspace(chars[1])) {
|
|
++stream->col;
|
|
token->type = TOKEN_TYPE_SYMBOL;
|
|
copy_to_buffer(token, ".", 1);
|
|
} else {
|
|
// the . is part of something bigger
|
|
next_number_or_symbol(stream, token, chars[0]);
|
|
}
|
|
} else if (nc == '#') {
|
|
token->type = TOKEN_TYPE_CHAR;
|
|
next_char_literal(stream, token);
|
|
} else if (nc == '-' || nc == '+' || (nc >= '0' && nc <= '9')) {
|
|
next_number_or_symbol(stream, token, fgetc(stream->src));
|
|
} else {
|
|
token->type = nc == ':' ? TOKEN_TYPE_PROPERTY : TOKEN_TYPE_SYMBOL;
|
|
next_symbol(stream, token);
|
|
}
|
|
return stream->error_count;
|
|
}
|
|
|
|
// Releases the text buffer owned by a token. The Token object itself is
// not freed.
//
// The token is left in an empty, reusable state: text is NULL and both len
// and buf_len are 0. Without the reset, reusing the token would write
// through the dangling pointer, because the stale buf_len would still claim
// there is capacity. Passing NULL is a no-op.
void token_free(Token *token) {
    if (!token) {
        return;
    }
    free(token->text);
    token->text = NULL;
    token->len = 0;
    token->buf_len = 0;
}
|
|
|
|
// return the number of errors left
|
|
ParseError *token_stream_error(TokenStream *stream) {
|
|
if (stream->error_count == 0) {
|
|
return NULL;
|
|
}
|
|
ParseError *error = stream->error_head;
|
|
stream->error_head = stream->error_head->next;
|
|
if (!stream->error_head) {
|
|
stream->error_tail = NULL;
|
|
}
|
|
error->next = NULL;
|
|
--stream->error_count;
|
|
return error;
|
|
}
|
|
|
|
// Frees a ParseError together with the description and context strings it
// owns. A NULL argument is ignored.
void parse_error_free(ParseError *error) {
    if (!error) {
        return;
    }
    free(error->context);
    free(error->desc);
    free(error);
}
|
|
|
|
// Reports whether the end-of-file indicator is set on the stream's source
// file (as reported by feof()).
bool token_stream_is_eof(TokenStream *stream) {
    FILE *source = stream->src;
    return feof(source) != 0;
}
|