// simple-lisp/bootstrap/parse.c

#include "parse.h"

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

// Return a human-readable name for a token type.
//
// The result points at a string literal: it must not be modified or freed.
// A value that matches none of the enumerators handled below yields
// "UNKNOWN".
const char *token_type_to_str(TokenType type) {
    switch (type) {
    case TOKEN_TYPE_EOF:
        return "EOF";
    case TOKEN_TYPE_COMMENT:
        return "COMMENT";
    case TOKEN_TYPE_PAREN:
        return "PAREN";
    case TOKEN_TYPE_BRACKET:
        return "BRACKET";
    case TOKEN_TYPE_SYMBOL:
        return "SYMBOL";
    case TOKEN_TYPE_PROPERTY:
        return "PROPERTY";
    case TOKEN_TYPE_QUOTE:
        return "QUOTE";
    case TOKEN_TYPE_NUMBER:
        return "NUMBER";
    case TOKEN_TYPE_CHAR:
        return "CHAR";
    case TOKEN_TYPE_STRING:
        return "STRING";
    case TOKEN_TYPE_COMMA:
        return "COMMA";
    case TOKEN_TYPE_BACKQUOTE:
        return "BACKQUOTE";
    case TOKEN_TYPE_SPLICE:
        return "SPLICE";
    case TOKEN_TYPE_UNKNOWN:
        return "UNKNOWN";
    }
    // No `default:` label on purpose, so the compiler can still warn when a
    // new enumerator is not handled. This return covers out-of-range values;
    // without it control would fall off the end of a non-void function.
    return "UNKNOWN";
}

// Append one character to the token text, growing the buffer when it is full.
//
// The text is NOT NUL-terminated by this function; append_null_byte() does
// that once the token is complete. Running out of memory is fatal: the
// function has no way to report failure, so it prints a message and aborts.
static void append_char(Token *token, char new_char) {
    if (token->len >= token->buf_len) {
        // Grow geometrically so that building an n-character token costs
        // O(n) amortised instead of one realloc per character.
        size_t new_cap = token->buf_len < 8 ? 16 : token->buf_len * 2;
        if (new_cap <= token->len) {
            // Doubling wrapped around or is still too small.
            new_cap = token->len + 1;
        }
        // Keep the old pointer until the new one is known to be valid.
        void *grown = realloc(token->text, new_cap);
        if (!grown) {
            fputs("parse: out of memory\n", stderr);
            abort();
        }
        token->text = grown;
        token->buf_len = new_cap;
    }
    token->text[token->len++] = new_char;
}

// NUL-terminate the token text without changing token->len.
//
// The buffer is grown by exactly one byte when there is no room for the
// terminator. Running out of memory is fatal: the function has no way to
// report failure, so it prints a message and aborts.
static void append_null_byte(Token *token) {
    if (token->len >= token->buf_len) {
        size_t new_cap = token->len + 1;
        // Keep the old pointer until the new one is known to be valid.
        void *grown = realloc(token->text, new_cap);
        if (!grown) {
            fputs("parse: out of memory\n", stderr);
            abort();
        }
        token->text = grown;
        token->buf_len = new_cap;
    }
    token->text[token->len] = '\0';
}

// Replace the token text with a copy of the first `src_len` bytes of `src`
// and NUL-terminate it. token->len becomes `src_len`.
//
// `src` must not point into token->text (memcpy is used, and the buffer may
// be reallocated first). Running out of memory is fatal: the function has no
// way to report failure, so it prints a message and aborts.
static void copy_to_buffer(Token *token, const char *src, size_t src_len) {
    if (src_len >= token->buf_len) {
        size_t new_cap = src_len + 1; // room for the terminator
        // Keep the old pointer until the new one is known to be valid.
        void *grown = realloc(token->text, new_cap);
        if (!grown) {
            fputs("parse: out of memory\n", stderr);
            abort();
        }
        token->text = grown;
        token->buf_len = new_cap;
    }
    memcpy(token->text, src, src_len);
    token->len = src_len;
    token->text[src_len] = '\0';
}

// Return non-zero when `c` terminates a symbol: any whitespace character,
// one of ( ) [ ], or a comma. EOF is not a terminator here; callers test for
// it separately.
//
// Callers in this file sometimes pass a plain `char`, which may be negative
// for bytes >= 0x80. <ctype.h> functions are only defined for EOF and values
// representable as unsigned char, so the value is narrowed before isspace().
static int issymbolend(int c) {
    if (c == EOF) {
        return 0;
    }
    return isspace((unsigned char) c) || c == ')' || c == ']' || c == '(' ||
           c == '[' || c == ',';
}

// Append a new error to the stream's error queue and reset `token`.
//
// The error records the token's line and column, a private copy of the
// token's current text (as `context`), and `desc`. The `desc` pointer is
// stored as-is, not copied; parse_error_free() later passes it to free(), so
// callers hand over a heap-allocated string (callers in this file use
// strdup). `at_end` is stored in the error; callers in this file pass true
// when the error was caused by running out of input.
//
// On return the token has type TOKEN_TYPE_UNKNOWN and empty text.
// Running out of memory is fatal: the function prints a message and aborts.
static void token_stream_push_error(TokenStream *stream, Token *token,
                                    char *desc, bool at_end) {
    ParseError *err = malloc(sizeof *err);
    if (!err) {
        fputs("parse: out of memory\n", stderr);
        abort();
    }
    err->context = malloc(token->len + 1);
    if (!err->context) {
        fputs("parse: out of memory\n", stderr);
        abort();
    }
    err->next = NULL;
    err->col = token->col;
    err->line = token->line;
    err->desc = desc;
    err->at_end = at_end;
    // token->text may still be NULL for an empty token; memcpy must not be
    // given a NULL source even for a zero-length copy.
    if (token->len > 0) {
        memcpy(err->context, token->text, token->len);
    }
    err->context[token->len] = '\0';

    // Link at the tail so errors are reported in the order they occurred.
    if (stream->error_tail) {
        stream->error_tail->next = err;
    } else {
        stream->error_head = err;
    }
    stream->error_tail = err;
    ++stream->error_count;

    token->type = TOKEN_TYPE_UNKNOWN;
    copy_to_buffer(token, "", 0);
}

// Create a token stream that reads from `src`.
//
// On success the stream takes ownership of `src`: destroy_token_stream()
// closes it. The stream starts at line 1, column 0, with an empty error
// queue.
//
// Returns NULL if memory cannot be allocated; in that case `src` is left
// untouched and still belongs to the caller.
TokenStream *make_token_stream(FILE *src) {
    TokenStream *stream = malloc(sizeof *stream);
    if (!stream) {
        return NULL;
    }
    stream->src = src;
    stream->col = 0;
    stream->line = 1;
    stream->error_tail = NULL;
    stream->error_head = NULL;
    stream->error_count = 0;
    return stream;
}

// Destroy a token stream: free every error still queued on it, close the
// source file, and free the stream itself. Passing NULL is a no-op.
//
// Each queued error owns its `desc` and `context` strings (the same ones
// parse_error_free() releases for errors that were popped), so they are
// freed here along with the node; freeing only the node would leak them.
void destroy_token_stream(TokenStream *stream) {
    if (!stream) {
        return;
    }
    ParseError *err = stream->error_head;
    while (err) {
        ParseError *next = err->next;
        free(err->desc);
        free(err->context);
        free(err);
        err = next;
    }
    if (stream->src) {
        fclose(stream->src);
    }
    free(stream);
}

// Read a comment token: everything up to, but not including, the next
// newline or the end of input is appended to the token text. The caller has
// pushed the leading ';' back onto the stream, so it is part of the text.
//
// The terminating newline is consumed. The stream is then moved to the
// start of the next line (line + 1, column 0); this happens even when the
// comment was ended by end of input rather than a newline.
static void next_comment(TokenStream *stream, Token *token) {
    // fgetc() returns int. Storing it in a char would make byte 0xFF
    // indistinguishable from EOF (or make EOF undetectable where char is
    // unsigned).
    int c;
    while ((c = fgetc(stream->src)) != EOF && c != '\n') {
        append_char(token, (char) c);
    }
    append_null_byte(token);
    ++stream->line;
    stream->col = 0;
}

// Consume characters from the stream while `pred` holds for them (or, when
// `inv` is true, while `pred` does NOT hold). Line and column counters are
// updated for every character consumed. The first character that stops the
// scan is pushed back so the next read sees it.
//
// `pred` follows the <ctype.h> convention: it receives a value in the range
// of unsigned char and returns non-zero for a match.
static void skip_while(TokenStream *stream, int(*pred)(int c), bool inv) {
    // fgetc() returns int. Keeping it as int lets EOF be told apart from
    // byte 0xFF and guarantees `pred` never sees a negative character value.
    int c;
    while ((c = fgetc(stream->src)) != EOF && ((pred(c) != 0) != inv)) {
        if (c == '\n') {
            ++stream->line;
            stream->col = 0;
        } else {
            ++stream->col;
        }
    }
    if (c != EOF) {
        ungetc(c, stream->src);
    }
}

// Read exactly one character from the stream and make it the whole token
// text (NUL-terminated, length 1). Advances the column by one.
static void next_char(TokenStream *stream, Token *token) {
    char single = fgetc(stream->src);
    token->len = 1;
    copy_to_buffer(token, &single, 1);
    stream->col += 1;
}

// Read a string literal. The caller has pushed the opening '"' back onto
// the stream; it is consumed here and becomes the first character of the
// token text.
//
// Escapes: a backslash before '"' is dropped and the quote is kept as a
// literal character; a backslash before any other character is kept along
// with that character. The closing '"' is appended, so a well-formed token
// starts and ends with a double quote.
//
// If the input ends before the closing quote, an error is pushed (which
// resets the token to TOKEN_TYPE_UNKNOWN with empty text).
static void next_string(TokenStream *stream, Token *token) {
    bool backslash = false;
    // fgetc() returns int. Storing it in a char would make byte 0xFF
    // indistinguishable from EOF.
    int c = fgetc(stream->src); // opening "
    append_char(token, (char) c);
    while ((c = fgetc(stream->src)) != EOF && (backslash || c != '"')) {
        if (c == '\\' && !backslash) {
            // Hold the backslash until the escaped character is known.
            backslash = true;
        } else {
            if (backslash && c != '"') {
                append_char(token, '\\');
            }
            backslash = false;
            append_char(token, (char) c);
            if (c == '\n') {
                ++stream->line;
                stream->col = 0;
            } else {
                ++stream->col;
            }
        }
    }
    ++stream->col;
    // Decide on the value actually read, not on feof(): the loop can also
    // stop on a read error, which feof() does not report.
    if (c == EOF) {
        token_stream_push_error(stream, token, strdup("expected '\"', got EOF"),
                                true);
        append_null_byte(token);
    } else {
        append_char(token, '"');
        append_null_byte(token);
    }
}

// Append characters to the token until a symbol terminator (see
// issymbolend) or the end of input, then NUL-terminate the text.
//
// A backslash escapes the next character: the backslash itself is dropped
// and the following character is appended even if it is a terminator. The
// terminator that stops the scan is pushed back onto the stream.
//
// NOTE(review): this function does not advance stream->col for the
// characters it consumes; confirm whether that is intended.
static void next_symbol(TokenStream *stream, Token *token) {
    // fgetc() returns int. Storing it in a char would make byte 0xFF
    // indistinguishable from EOF (or make EOF undetectable where char is
    // unsigned).
    int c;
    bool backslash = false;
    while ((c = fgetc(stream->src)) != EOF &&
           (backslash || !issymbolend(c))) {
        if (c == '\\' && !backslash) {
            backslash = true;
        } else {
            backslash = false;
            append_char(token, (char) c);
        }
    }
    append_null_byte(token);
    if (c != EOF) {
        ungetc(c, stream->src);
    }
}

// Read a character literal. The caller has pushed the leading '#' back onto
// the stream; it is consumed here.
//
// Two forms are accepted:
//   #x      a single character
//   #\name  a backslash followed by any run of letters, digits and
//           backslashes (a named character such as #\n)
// The token text is the literal as written, including the '#'.
//
// Errors (each resets the token to TOKEN_TYPE_UNKNOWN with empty text):
//   - the input ends right after '#'
//   - the literal is not followed by a symbol terminator or end of input;
//     the rest of the offending run is skipped
static void next_char_literal(TokenStream *stream, Token *token) {
    fgetc(stream->src); // the '#'
    append_char(token, '#');
    // fgetc() returns int. Keeping it as int lets EOF be told apart from
    // byte 0xFF and keeps the <ctype.h> calls below well defined.
    int c = fgetc(stream->src);
    stream->col += 2;
    if (c == EOF) {
        // Nothing follows the '#'. Test for EOF before appending, so the
        // EOF sentinel never ends up in the token text or error context.
        // token_stream_push_error() also empties the token and marks it
        // TOKEN_TYPE_UNKNOWN.
        token_stream_push_error(stream, token,
                                strdup("expected character literal, got EOF"),
                                true);
    } else {
        append_char(token, (char) c);
        if (c == '\\') {
            // named character literal, like "#\n"
            while ((c = fgetc(stream->src)) != EOF &&
                   (isalpha(c) || isdigit(c) || c == '\\')) {
                append_char(token, (char) c);
                ++stream->col;
            }
        } else {
            // Look at the character after the literal.
            c = fgetc(stream->src);
        }
    }
    append_null_byte(token);
    // `c` now holds the first character after the literal (or EOF).
    if (c != EOF && !issymbolend(c)) {
        token_stream_push_error(stream, token,
                                strdup("character literal too long"),
                                false);
        skip_while(stream, &issymbolend, true);
    } else if (c != EOF) {
        ungetc(c, stream->src);
    }
}

// Tell whether `c` is a valid digit in the given base.
// Only bases 2, 8, 10 and 16 are supported; any other base yields false.
// Hexadecimal letters are accepted in either case.
static int isbasechar(char c, int base) {
    if (base != 2 && base != 8 && base != 10 && base != 16) {
        return false;
    }
    if (c < '0') {
        return false;
    }
    if (base == 16) {
        char lowered = tolower(c);
        if (lowered >= 'a' && lowered <= 'f') {
            return true;
        }
        // Not a hex letter: only the decimal digits remain.
        return lowered <= '9';
    }
    // Highest digit of base b is '0' + (b - 1).
    return c <= '0' + (base - 1);
}

// Interpret the token text read so far as a radix prefix.
// Leading zeros are ignored; what remains must be exactly "2", "8", "10" or
// "16". Returns that base, or 0 when the text is anything else.
static int get_base_from_token(Token *token) {
    size_t start = 0;
    while (start < token->len && token->text[start] == '0') {
        ++start;
    }
    size_t remaining = token->len - start;

    if (remaining == 1) {
        if (token->text[start] == '2') {
            return 2;
        }
        if (token->text[start] == '8') {
            return 8;
        }
        return 0;
    }
    if (remaining == 2 && token->text[start] == '1') {
        if (token->text[start + 1] == '0') {
            return 10;
        }
        if (token->text[start + 1] == '6') {
            return 16;
        }
    }
    return 0;
}

// Read a token that starts like a number and classify it as
// TOKEN_TYPE_NUMBER or TOKEN_TYPE_SYMBOL.
//
// `first_char` has already been consumed by the caller: a digit, '+', '-'
// or '.'. Accepted number shapes:
//   - digits with at most one '.'
//   - a radix prefix "<base>#", where <base> is 2, 8, 10 or 16 written in
//     plain digits, followed by digits valid in that base; a sign may follow
//     the '#' directly
//   - in base 10, an exponent introduced by a lower-case 'e', optionally
//     followed by a sign; the exponent may contain its own '.'
// Anything that breaks these rules makes the whole run a symbol: the
// offending character is pushed back and next_symbol() reads the rest. A
// run with no digit at all (a lone "+", for example) is also a symbol.
//
// NOTE(review): a '.' and a sign after '#'/'e' both set has_num, so inputs
// such as "-." are classified as NUMBER; confirm that this is intended.
static void next_number_or_symbol(TokenStream *stream, Token *token, char first_char) {
    token->type = TOKEN_TYPE_NUMBER;
    bool has_decimal = false;      // a '.' was seen in the current part
    bool has_base = false;         // a "<base>#" prefix was seen
    bool only_nums = true;         // text so far is digits only
    bool has_exp = false;          // an 'e' exponent marker was seen
    bool allow_plus_minus = false; // next char may be a sign
    int base = 10;
    bool has_num = isbasechar(first_char, 10);
    if (first_char == '.') {
        has_decimal = true;
        only_nums = false;
    } else if (first_char == '-' || first_char == '+') {
        only_nums = false;
    }
    append_char(token, first_char);
    // fgetc() returns int. Storing it in a char would make byte 0xFF
    // indistinguishable from EOF (or make EOF undetectable where char is
    // unsigned).
    int c;
    while ((c = fgetc(stream->src)) != EOF && !issymbolend(c)) {
        ++stream->col;
        if (c == '.') {
            if (has_decimal || has_base) {
                goto as_symbol;
            }
            only_nums = false;
            has_decimal = true;
        } else if (c == '#') {
            // The text so far must be a bare radix: digits only, no
            // earlier prefix.
            if (has_base || !only_nums) {
                goto as_symbol;
            }
            base = get_base_from_token(token);
            if (base == 0) {
                goto as_symbol;
            }
            has_base = true;
            has_num = false; // prefix digits do not count as the number
            allow_plus_minus = true;
            append_char(token, (char) c);
            continue;
        } else if (base == 10 && c == 'e') {
            if (has_exp) {
                goto as_symbol;
            }
            allow_plus_minus = true;
            only_nums = false;
            has_exp = true;
            has_decimal = false; // the exponent can have a decimal point
            append_char(token, (char) c);
            continue;
        } else if (allow_plus_minus && (c == '+' || c == '-')) {
            // sign directly after '#' or 'e': accepted, handled below
        } else if (!isbasechar((char) c, base)) {
            goto as_symbol;
        }
        has_num = true;
        allow_plus_minus = false;
        append_char(token, (char) c);
    }
    if (!has_num) {
        token->type = TOKEN_TYPE_SYMBOL;
    }
    append_null_byte(token);
    if (c != EOF) {
        ungetc(c, stream->src);
    }
    return;

as_symbol:
    // Not a number after all: hand the offending character back and let
    // next_symbol() append the rest of the run to what has been read so far.
    token->type = TOKEN_TYPE_SYMBOL;
    ungetc(c, stream->src);
    next_symbol(stream, token);
}

// Read the next token from the stream into `token`.
//
// Leading whitespace is skipped. token->line and token->col are set to the
// position where the token starts, and token->type and the NUL-terminated
// token->text are filled in. At end of input the token has type
// TOKEN_TYPE_EOF and empty text. The token's text buffer is reused between
// calls; release it with token_free().
//
// Returns the number of errors currently queued on the stream (errors that
// were pushed and not yet taken with token_stream_error()).
size_t token_stream_next(TokenStream *stream, Token *token) {
    skip_while(stream, &isspace, false);
    token->line = stream->line;
    token->col = stream->col + 1;
    token->len = 0;
    // fgetc() returns int. Storing it in a char would make byte 0xFF
    // indistinguishable from EOF (or make EOF undetectable where char is
    // unsigned).
    int nc = fgetc(stream->src);
    if (nc == EOF) {
        token->type = TOKEN_TYPE_EOF;
        copy_to_buffer(token, "", 0);
        return stream->error_count;
    }
    // Only peeking: the readers below consume the character themselves.
    ungetc(nc, stream->src);
    if (nc == ';') {
        token->type = TOKEN_TYPE_COMMENT;
        next_comment(stream, token);
    } else if (nc == '(' || nc == ')') {
        token->type = TOKEN_TYPE_PAREN;
        next_char(stream, token);
    } else if (nc == '[' || nc == ']') {
        token->type = TOKEN_TYPE_BRACKET;
        next_char(stream, token);
    } else if (nc == '\'') {
        token->type = TOKEN_TYPE_QUOTE;
        next_char(stream, token);
    } else if (nc == '`') {
        token->type = TOKEN_TYPE_BACKQUOTE;
        next_char(stream, token);
    } else if (nc == ',') {
        // look at the character after the comma: ",@" is a splice
        fgetc(stream->src); // the ','
        int after = fgetc(stream->src);
        if (after == '@') {
            token->type = TOKEN_TYPE_SPLICE;
            copy_to_buffer(token, ",@", 2);
        } else {
            if (after != EOF) {
                ungetc(after, stream->src);
            }
            token->type = TOKEN_TYPE_COMMA;
            copy_to_buffer(token, ",", 1);
        }
    } else if (nc == '"') {
        token->type = TOKEN_TYPE_STRING;
        next_string(stream, token);
    } else if (nc == '.') {
        // look at the character after the .
        fgetc(stream->src); // the '.'
        int after = fgetc(stream->src);
        if (after != EOF) {
            ungetc(after, stream->src);
        }
        if (isspace(after)) {
            // a lone "." followed by whitespace is the symbol "."
            ++stream->col;
            token->type = TOKEN_TYPE_SYMBOL;
            copy_to_buffer(token, ".", 1);
        } else {
            // the . is part of something bigger
            next_number_or_symbol(stream, token, '.');
        }
    } else if (nc == '#') {
        token->type = TOKEN_TYPE_CHAR;
        next_char_literal(stream, token);
    } else if (nc == '-' || nc == '+' || (nc >= '0' && nc <= '9')) {
        next_number_or_symbol(stream, token, (char) fgetc(stream->src));
    } else {
        token->type = nc == ':' ? TOKEN_TYPE_PROPERTY : TOKEN_TYPE_SYMBOL;
        next_symbol(stream, token);
    }
    return stream->error_count;
}

// Release the text buffer owned by `token`. The Token object itself is not
// freed. Passing NULL is a no-op.
//
// The buffer fields are reset afterwards, so the token can be reused or
// freed a second time safely instead of being left with a dangling pointer
// and a stale capacity.
void token_free(Token *token) {
    if (!token) {
        return;
    }
    free(token->text);
    token->text = NULL;
    token->len = 0;
    token->buf_len = 0;
}

// Remove the oldest queued error from the stream and return it, or return
// NULL when no errors are queued. The returned error is detached from the
// queue (its `next` is NULL) and the stream's error count drops by one;
// the caller releases it with parse_error_free().
ParseError *token_stream_error(TokenStream *stream) {
    ParseError *popped = NULL;
    if (stream->error_count != 0) {
        popped = stream->error_head;
        stream->error_head = popped->next;
        if (stream->error_head == NULL) {
            // The queue is now empty; the tail must not keep pointing at
            // the node being handed out.
            stream->error_tail = NULL;
        }
        popped->next = NULL;
        stream->error_count -= 1;
    }
    return popped;
}

// Free an error together with its description and context strings.
// Passing NULL is a no-op.
void parse_error_free(ParseError *error) {
    if (!error) {
        return;
    }
    free(error->context);
    free(error->desc);
    free(error);
}

// Report whether the end-of-file indicator is set on the stream's source
// file, i.e. the result of feof() on it.
bool token_stream_is_eof(TokenStream *stream) {
    FILE *input = stream->src;
    return feof(input);
}