simple-lisp/bootstrap/parse.c

463 lines
13 KiB
C

#include "parse.h"
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
// Map a TokenType to a static, human-readable name.
//
// The returned pointer refers to a string literal: callers must neither free
// nor modify it. A value that matches none of the cases below yields
// "UNKNOWN" rather than running off the end of the function.
const char *token_type_to_str(TokenType type) {
    // Deliberately no `default:` label, so the compiler can still warn when a
    // new enumerator is added without a matching case.
    switch (type) {
    case TOKEN_TYPE_EOF:
        return "EOF";
    case TOKEN_TYPE_COMMENT:
        return "COMMENT";
    case TOKEN_TYPE_PAREN:
        return "PAREN";
    case TOKEN_TYPE_BRACKET:
        return "BRACKET";
    case TOKEN_TYPE_SYMBOL:
        return "SYMBOL";
    case TOKEN_TYPE_PROPERTY:
        return "PROPERTY";
    case TOKEN_TYPE_QUOTE:
        return "QUOTE";
    case TOKEN_TYPE_NUMBER:
        return "NUMBER";
    case TOKEN_TYPE_CHAR:
        return "CHAR";
    case TOKEN_TYPE_STRING:
        return "STRING";
    case TOKEN_TYPE_COMMA:
        return "COMMA";
    case TOKEN_TYPE_BACKQUOTE:
        return "BACKQUOTE";
    case TOKEN_TYPE_SPLICE:
        return "SPLICE";
    case TOKEN_TYPE_UNKNOWN:
        return "UNKNOWN";
    }
    // Reached only for values outside the enumeration (e.g. a corrupted or
    // uninitialized token type).
    return "UNKNOWN";
}
// Make sure token->text can hold at least `needed` bytes.
//
// Capacity grows geometrically so that a run of single-character appends
// costs amortized O(1) reallocations instead of one per character. The
// realloc() result is checked before it replaces the old pointer; on failure
// the process aborts because no caller has a way to report the error.
static void token_reserve(Token *token, size_t needed) {
    if (needed <= token->buf_len) {
        return;
    }
    size_t new_len = token->buf_len ? token->buf_len : 16;
    while (new_len < needed) {
        if (new_len > (size_t) -1 / 2) {
            // doubling would overflow; fall back to the exact size
            new_len = needed;
            break;
        }
        new_len *= 2;
    }
    // realloc(NULL, n) behaves like malloc(n), so a token whose text is
    // still NULL (with buf_len == 0) is handled here as well.
    void *grown = realloc(token->text, new_len);
    if (!grown) {
        abort();
    }
    token->text = grown;
    token->buf_len = new_len;
}

// Append `new_char` to the token text and bump token->len.
// The text is NOT NUL-terminated afterwards; see append_null_byte().
static void append_char(Token *token, char new_char) {
    token_reserve(token, token->len + 1);
    token->text[token->len++] = new_char;
}

// Write a terminating NUL right after the token text.
// token->len is left unchanged, so the terminator is not counted.
static void append_null_byte(Token *token) {
    token_reserve(token, token->len + 1);
    token->text[token->len] = '\0';
}

// Replace the token text with the first `src_len` bytes of `src`, set
// token->len to `src_len` and NUL-terminate the result.
static void copy_to_buffer(Token *token, const char *src, size_t src_len) {
    token_reserve(token, src_len + 1);
    memcpy(token->text, src, src_len);
    token->len = src_len;
    token->text[src_len] = '\0';
}
// Return non-zero when `c` ends a symbol: any whitespace character, an
// opening or closing paren/bracket, or a comma.
static int issymbolend(int c) {
    // <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF). Callers pass plain `char`s, which may be
    // negative, so normalise before calling isspace().
    return isspace((unsigned char) c) || c == ')' || c == ']' || c == '(' ||
           c == '[' || c == ',';
}
// Record a tokenizer error on `stream` and invalidate `token`.
//
// A new ParseError is appended to the tail of the stream's error list. It
// carries the token's line and column, a private copy of the token text
// gathered so far (`context`), the `at_end` flag and `desc`.
//
// Ownership of `desc` passes to the error record: it is released with free()
// by parse_error_free(), so it must be heap-allocated (callers in this file
// pass the result of strdup()).
//
// On return the token has type TOKEN_TYPE_UNKNOWN and empty text. If memory
// for the record cannot be obtained the process aborts, since this function
// has no way to report the failure.
static void token_stream_push_error(TokenStream *stream, Token *token,
                                    char *desc, bool at_end) {
    ParseError *err = malloc(sizeof *err);
    char *context = malloc(token->len + 1);
    if (!err || !context) {
        abort();
    }

    // Snapshot the token text before it is cleared below.
    if (token->len > 0) {
        memcpy(context, token->text, token->len);
    }
    context[token->len] = '\0';

    err->next = NULL;
    err->col = token->col;
    err->line = token->line;
    err->desc = desc;
    err->context = context;
    err->at_end = at_end;

    // Append to the singly linked list, keeping head/tail in sync.
    if (stream->error_tail) {
        stream->error_tail->next = err;
    } else {
        stream->error_head = err;
    }
    stream->error_tail = err;
    ++stream->error_count;

    token->type = TOKEN_TYPE_UNKNOWN;
    copy_to_buffer(token, "", 0);
}
// Create a token stream reading from `src`.
//
// The stream takes ownership of `src`: destroy_token_stream() closes it.
// Line numbering starts at 1, the column counter at 0, and the error list is
// empty.
//
// Returns NULL if the stream cannot be allocated. Because ownership of `src`
// has already been handed over, `src` is closed in that case too.
TokenStream *make_token_stream(FILE *src) {
    TokenStream *stream = malloc(sizeof *stream);
    if (!stream) {
        if (src) {
            fclose(src);
        }
        return NULL;
    }
    stream->src = src;
    stream->col = 0;
    stream->line = 1;
    stream->error_tail = NULL;
    stream->error_head = NULL;
    stream->error_count = 0;
    return stream;
}
// Release a token stream: every error still queued on it, the underlying
// FILE, and the stream object itself. Passing NULL is a no-op.
//
// Errors already handed out by token_stream_error() are unlinked from the
// list and are therefore not touched here.
void destroy_token_stream(TokenStream *stream) {
    if (!stream) {
        return;
    }
    ParseError *err = stream->error_head;
    while (err) {
        ParseError *next = err->next;
        // Each record owns its description and context strings; freeing only
        // the node would leak both.
        free(err->desc);
        free(err->context);
        free(err);
        err = next;
    }
    fclose(stream->src);
    free(stream);
}
// Read a comment into `token`: every character up to, but not including, the
// next newline (or end of input). The newline itself is consumed.
//
// Afterwards the stream position is moved to the start of the following line
// (line incremented, column reset). Note this happens even when the comment
// was ended by end of input rather than by a newline.
static void next_comment(TokenStream *stream, Token *token) {
    // fgetc() returns an int so that EOF can be told apart from every valid
    // byte; storing it in a char would confuse byte 0xFF with EOF (or never
    // match EOF at all where char is unsigned).
    int c;
    while ((c = fgetc(stream->src)) != EOF) {
        if (c == '\n') {
            break;
        }
        append_char(token, (char) c);
    }
    append_null_byte(token);
    ++stream->line;
    stream->col = 0;
}
// Consume characters from the stream for as long as `pred` accepts them, or,
// when `inv` is true, for as long as `pred` rejects them. The first character
// that stops the scan is pushed back so the next read sees it.
//
// Line and column counters are updated for every character consumed.
static void skip_while(TokenStream *stream, int(*pred)(int c), bool inv) {
    // Kept as int: fgetc() yields an unsigned-char value or EOF, which is
    // exactly the domain <ctype.h>-style predicates are defined for.
    int c;
    while ((c = fgetc(stream->src)) != EOF) {
        const bool accepted = pred(c) != 0;
        if (accepted == inv) {
            break;
        }
        if (c == '\n') {
            ++stream->line;
            stream->col = 0;
        } else {
            ++stream->col;
        }
    }
    // ungetc(EOF, ...) is defined to do nothing, so no special case is
    // needed when the loop ended at end of input.
    ungetc(c, stream->src);
}
// Read exactly one character from the stream and make it the whole token
// text, advancing the column counter by one.
static void next_char(TokenStream *stream, Token *token) {
    const char ch = fgetc(stream->src);
    // copy_to_buffer() sets token->len itself and NUL-terminates the text.
    copy_to_buffer(token, &ch, 1);
    stream->col += 1;
}
// Read a double-quoted string literal into `token`, including both quotes.
//
// The caller has already peeked the opening '"', which is consumed here.
// Inside the literal a backslash escapes the next character: `\"` is stored
// as a bare `"`, while any other escape keeps its backslash (e.g. `\n` is
// stored as the two characters `\` and `n`).
//
// If the input ends before the closing quote, an error is pushed on the
// stream, which resets the token to TOKEN_TYPE_UNKNOWN with empty text.
static void next_string(TokenStream *stream, Token *token) {
    bool backslash = false;
    // int, not char: EOF must stay distinguishable from the byte 0xFF.
    int c = fgetc(stream->src); // opening "
    append_char(token, (char) c);
    while ((c = fgetc(stream->src)) != EOF && (backslash || c != '"')) {
        if (c == '\\' && !backslash) {
            backslash = true;
        } else {
            if (backslash && c != '"') {
                append_char(token, '\\');
            }
            backslash = false;
            append_char(token, (char) c);
            if (c == '\n') {
                ++stream->line;
                stream->col = 0;
            } else {
                ++stream->col;
            }
        }
    }
    ++stream->col;
    // Test the value that ended the loop instead of feof(): that also covers
    // read errors, and cannot mistake a 0xFF byte for a closing quote.
    if (c == EOF) {
        token_stream_push_error(stream, token, strdup("expected '\"', got EOF"),
                                true);
        append_null_byte(token);
    } else {
        append_char(token, '"');
        append_null_byte(token);
    }
}
// Append characters to `token` until a symbol terminator (see issymbolend())
// or end of input is reached, then NUL-terminate the text.
//
// A backslash escapes the following character: the backslash itself is
// dropped and the next character is appended even if it would normally end
// the symbol. The terminating character is pushed back onto the stream.
static void next_symbol(TokenStream *stream, Token *token) {
    // int, not char: EOF must stay distinguishable from the byte 0xFF.
    int c;
    bool backslash = false;
    while ((c = fgetc(stream->src)) != EOF &&
           (backslash || !issymbolend(c))) {
        if (c == '\\' && !backslash) {
            backslash = true;
        } else {
            backslash = false;
            append_char(token, (char) c);
        }
    }
    append_null_byte(token);
    // A no-op when c is EOF.
    ungetc(c, stream->src);
}
// Read a character literal into `token`.
//
// The caller has already peeked the leading '#', which is consumed here.
// Two forms are accepted:
//   #x       a single character
//   #\name   a backslash followed by letters, digits or further backslashes
//            (a "named" literal such as #\n)
//
// The literal must be followed by a symbol terminator or end of input;
// otherwise a "character literal too long" error is pushed and the rest of
// the run is skipped. Reaching end of input right after '#' is reported as
// an error as well. Pushing an error resets the token to TOKEN_TYPE_UNKNOWN
// with empty text.
static void next_char_literal(TokenStream *stream, Token *token) {
    fgetc(stream->src); // the '#'
    append_char(token, '#');
    // int, not char: EOF must stay distinguishable from the byte 0xFF.
    int c = fgetc(stream->src);
    stream->col += 2;
    if (c == EOF) {
        // Nothing is appended for EOF, so the error context is just "#".
        token_stream_push_error(stream, token,
                                strdup("expected character literal, got EOF"),
                                true);
        token->len = 0;
        token->type = TOKEN_TYPE_UNKNOWN;
    } else {
        append_char(token, (char) c);
        if (c == '\\') {
            // named character literal, like "#\n"
            while ((c = fgetc(stream->src)) != EOF &&
                   (isalpha(c) || isdigit(c) || c == '\\')) {
                append_char(token, (char) c);
                ++stream->col;
            }
        } else {
            c = fgetc(stream->src);
        }
    }
    append_null_byte(token);
    // At this point `c` is the first character after the literal (or EOF).
    if (c != EOF && !issymbolend(c)) {
        token_stream_push_error(stream, token,
                                strdup("character literal too long"),
                                false);
        skip_while(stream, &issymbolend, true);
    } else {
        ungetc(c, stream->src);
    }
}
// Tell whether `c` is a valid digit in the given numeric base.
// Only bases 2, 8, 10 and 16 are recognised; any other base yields false.
// Hexadecimal letters are accepted in either case.
static int isbasechar(char c, int base) {
    // Everything below '0' can never be a digit in any supported base.
    if (c < '0') {
        return false;
    }
    if (base == 16) {
        const char lowered = tolower(c);
        if (lowered >= 'a' && lowered <= 'f') {
            return true;
        }
        // Not a hex letter: it must then be an ordinary decimal digit.
        return lowered <= '9';
    }
    if (base == 2 || base == 8 || base == 10) {
        const int highest_digit = '0' + (base - 1);
        return c <= highest_digit;
    }
    return false;
}
// Interpret the text collected so far in `token` as a numeric base prefix.
// Leading '0' characters are ignored; what remains must be exactly "2", "8",
// "10" or "16". Returns that base, or 0 when the text is anything else.
static int get_base_from_token(Token *token) {
    size_t start = 0;
    while (start < token->len && token->text[start] == '0') {
        ++start;
    }

    const size_t significant = token->len - start;
    if (significant == 1) {
        const char digit = token->text[start];
        if (digit == '2') {
            return 2;
        }
        if (digit == '8') {
            return 8;
        }
    } else if (significant == 2 && token->text[start] == '1') {
        const char second = token->text[start + 1];
        if (second == '0') {
            return 10;
        }
        if (second == '6') {
            return 16;
        }
    }
    return 0;
}
// Abandon the attempt to read a number: push the offending character `c`
// back and let next_symbol() read the remainder of the token as a symbol.
static void finish_as_symbol(TokenStream *stream, Token *token, int c) {
    token->type = TOKEN_TYPE_SYMBOL;
    ungetc(c, stream->src);
    next_symbol(stream, token);
}

// Read a token that starts like a number and classify it as
// TOKEN_TYPE_NUMBER or TOKEN_TYPE_SYMBOL.
//
// `first_char` has already been consumed by the caller (a digit, '+', '-' or
// '.'). Scanning continues up to a symbol terminator or end of input. The
// accepted number shapes are:
//   - digits with at most one '.'
//   - a base prefix `<base>#`, where the text before '#' consists only of
//     digits naming base 2, 8, 10 or 16 (see get_base_from_token()),
//     followed by digits valid in that base; no '.' is allowed after it
//   - in base 10, a single 'e' exponent marker, after which another '.' is
//     allowed again
//   - a '+' or '-' directly after '#' or 'e'
// As soon as a character breaks these rules the token is re-read as a symbol.
// A token that ends without a digit (such as a lone "+") is also a symbol.
static void next_number_or_symbol(TokenStream *stream, Token *token, char first_char) {
    token->type = TOKEN_TYPE_NUMBER;
    bool has_decimal = false;
    bool has_base = false;
    bool only_nums = true; // true while only digits have been seen
    bool has_exp = false;
    bool allow_plus_minus = false;
    int base = 10;
    bool has_num = isbasechar(first_char, 10);
    if (first_char == '.') {
        has_decimal = true;
        only_nums = false;
    } else if (first_char == '-' || first_char == '+') {
        only_nums = false;
    }
    append_char(token, first_char);
    // int, not char: EOF must stay distinguishable from the byte 0xFF.
    int c;
    while ((c = fgetc(stream->src)) != EOF && !issymbolend(c)) {
        ++stream->col;
        if (c == '.') {
            if (has_decimal || has_base) {
                finish_as_symbol(stream, token, c);
                return;
            }
            only_nums = false;
            has_decimal = true;
        } else if (c == '#') {
            // Everything read so far must be a plain digit string naming a
            // supported base.
            if (has_base || !only_nums) {
                finish_as_symbol(stream, token, c);
                return;
            }
            base = get_base_from_token(token);
            if (base == 0) {
                finish_as_symbol(stream, token, c);
                return;
            }
            has_base = true;
            has_num = false; // digits of the prefix do not count as the value
            allow_plus_minus = true;
            append_char(token, (char) c);
            continue;
        } else if (base == 10 && c == 'e') {
            if (has_exp) {
                finish_as_symbol(stream, token, c);
                return;
            }
            allow_plus_minus = true;
            only_nums = false;
            has_exp = true;
            has_decimal = false; // the exponent can have a decimal point
            append_char(token, (char) c);
            continue;
        } else if (allow_plus_minus && (c == '+' || c == '-')) {
            // sign right after '#' or 'e': accepted, handled below
        } else if (!isbasechar((char) c, base)) {
            finish_as_symbol(stream, token, c);
            return;
        }
        has_num = true;
        allow_plus_minus = false;
        append_char(token, (char) c);
    }
    if (!has_num) {
        token->type = TOKEN_TYPE_SYMBOL;
    }
    append_null_byte(token);
    // A no-op when c is EOF.
    ungetc(c, stream->src);
}
// Read the next token from `stream` into `token`.
//
// Leading whitespace is skipped, the token's line/column are recorded, and
// the first character decides how the rest is read:
//   ;        comment up to end of line
//   ( )      TOKEN_TYPE_PAREN
//   [ ]      TOKEN_TYPE_BRACKET
//   '        TOKEN_TYPE_QUOTE
//   `        TOKEN_TYPE_BACKQUOTE
//   , / ,@   TOKEN_TYPE_COMMA / TOKEN_TYPE_SPLICE
//   "        string literal
//   .        the symbol "." when followed by whitespace, otherwise the start
//            of a number or symbol
//   #        character literal
//   + - 0-9  number or symbol
//   other    symbol, or TOKEN_TYPE_PROPERTY when it starts with ':'
// At end of input the token is TOKEN_TYPE_EOF with empty text.
//
// Returns the number of errors currently queued on the stream.
size_t token_stream_next(TokenStream *stream, Token *token) {
    skip_while(stream, &isspace, false);
    token->line = stream->line;
    token->col = stream->col + 1;
    token->len = 0;
    // int, not char: EOF must stay distinguishable from the byte 0xFF, and
    // pushing a char-truncated EOF back would inject a stray byte on
    // platforms where char is unsigned.
    int nc = fgetc(stream->src);
    if (nc == EOF) {
        token->type = TOKEN_TYPE_EOF;
        copy_to_buffer(token, "", 0);
        return stream->error_count;
    }
    // Only peeking: the reader chosen below consumes the character itself.
    ungetc(nc, stream->src);
    if (nc == ';') {
        token->type = TOKEN_TYPE_COMMENT;
        next_comment(stream, token);
    } else if (nc == '(' || nc == ')') {
        token->type = TOKEN_TYPE_PAREN;
        next_char(stream, token);
    } else if (nc == '[' || nc == ']') {
        token->type = TOKEN_TYPE_BRACKET;
        next_char(stream, token);
    } else if (nc == '\'') {
        token->type = TOKEN_TYPE_QUOTE;
        next_char(stream, token);
    } else if (nc == '`') {
        token->type = TOKEN_TYPE_BACKQUOTE;
        next_char(stream, token);
    } else if (nc == ',') {
        // look at the character after the ,
        fgetc(stream->src); // the ',' itself
        int after = fgetc(stream->src);
        if (after == '@') {
            token->type = TOKEN_TYPE_SPLICE;
            copy_to_buffer(token, ",@", 2);
        } else {
            ungetc(after, stream->src); // no-op at EOF
            token->type = TOKEN_TYPE_COMMA;
            copy_to_buffer(token, ",", 1);
        }
    } else if (nc == '"') {
        token->type = TOKEN_TYPE_STRING;
        next_string(stream, token);
    } else if (nc == '.') {
        // look at the character after the .
        fgetc(stream->src); // the '.' itself
        int after = fgetc(stream->src);
        ungetc(after, stream->src); // no-op at EOF
        // isspace(EOF) is well defined and false.
        if (isspace(after)) {
            ++stream->col;
            token->type = TOKEN_TYPE_SYMBOL;
            copy_to_buffer(token, ".", 1);
        } else {
            // the . is part of something bigger
            next_number_or_symbol(stream, token, '.');
        }
    } else if (nc == '#') {
        token->type = TOKEN_TYPE_CHAR;
        next_char_literal(stream, token);
    } else if (nc == '-' || nc == '+' || (nc >= '0' && nc <= '9')) {
        next_number_or_symbol(stream, token, (char) fgetc(stream->src));
    } else {
        token->type = nc == ':' ? TOKEN_TYPE_PROPERTY : TOKEN_TYPE_SYMBOL;
        next_symbol(stream, token);
    }
    return stream->error_count;
}
// Release the text buffer owned by `token`. The Token object itself is not
// freed.
//
// The buffer fields are reset afterwards, so the token can be reused by the
// tokenizer (which reallocates text based on len/buf_len) and a second
// token_free() is harmless instead of a double free.
void token_free(Token *token) {
    free(token->text);
    token->text = NULL;
    token->len = 0;
    token->buf_len = 0;
}
// Pop the oldest pending error off the stream.
//
// Returns NULL when no errors are queued. Otherwise the first record of the
// error list is unlinked (its `next` is cleared and the stream's error count
// drops by one) and returned. Since it is no longer on the stream's list,
// the caller releases it with parse_error_free().
ParseError *token_stream_error(TokenStream *stream) {
    if (stream->error_count == 0) {
        return NULL;
    }

    ParseError *popped = stream->error_head;
    ParseError *rest = popped->next;

    stream->error_head = rest;
    if (rest == NULL) {
        // the list is now empty, so the tail must not keep pointing at
        // the record being handed out
        stream->error_tail = NULL;
    }

    popped->next = NULL;
    stream->error_count -= 1;
    return popped;
}
// Free a ParseError together with the description and context strings it
// owns. A NULL argument is ignored.
void parse_error_free(ParseError *error) {
    if (error == NULL) {
        return;
    }
    free(error->desc);
    free(error->context);
    free(error);
}
// Report whether the end-of-file indicator is set on the stream's source
// FILE, as given by feof().
bool token_stream_is_eof(TokenStream *stream) {
    const int eof_flag = feof(stream->src);
    return eof_flag != 0;
}