// lambda/token.c

#include "token.h"

#include "util.h"

#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <string.h>

// Returns the upper-case display name of a token type (for example "LET" for
// TOKEN_LET). The value is looked up in a static table indexed by the
// enumerator; an assertion checks that `type` is inside that table.
const char *token_type_to_string(TokenType type) {
    static const char *const NAMES[] = {
        [TOKEN_LET] = "LET",
        [TOKEN_EVAL] = "EVAL",
        [TOKEN_CONF] = "CONF",
        [TOKEN_EQUALS] = "EQUALS",
        [TOKEN_BACKSLASH] = "BACKSLASH",
        [TOKEN_COLON] = "COLON",
        [TOKEN_OPEN_PAREN] = "OPEN_PAREN",
        [TOKEN_CLOSE_PAREN] = "CLOSE_PAREN",
        [TOKEN_DEFINE] = "DEFINE",
        [TOKEN_IDENT] = "IDENT",
        [TOKEN_REDUCE] = "REDUCE",
        [TOKEN_EOF] = "EOF",
    };
    const size_t name_count = sizeof(NAMES) / sizeof(NAMES[0]);

    assert(type >= 0 && (size_t) type < name_count);
    return NAMES[type];
}

// Sentinel returned by peekc() and popc() when the read position is at or past
// the end of the source buffer ("end of stream").
#define EOS -1

// True for the characters the tokenizer skips between tokens: space, tab and
// line feed.
static bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
        return true;
    default:
        return false;
    }
}

// True for characters that may appear in an identifier: the lower-case ASCII
// letters 'a'..'z' and the underscore.
static bool is_identifier(char c) {
    if (c == '_') {
        return true;
    }
    return !(c < 'a' || c > 'z');
}

// True for characters accepted inside a reduce token by read_next_reduce():
// the lower-case ASCII letters 'a'..'z' plus '*', '~' and ':'.
static bool is_reduce(char c) {
    switch (c) {
    case '*':
    case '~':
    case ':':
        return true;
    default:
        return 'a' <= c && c <= 'z';
    }
}

// Returns the byte at the current read position without consuming it, or EOS
// when the position is at or past the end of the source.
//
// The byte is converted through unsigned char so the result is always in the
// range 0..UCHAR_MAX. In case `src` holds plain (possibly signed) char, this
// keeps a 0xFF byte from being sign-extended to -1 and mistaken for EOS.
static int peekc(TokenStream *stream) {
    if (stream->pos.offset >= stream->src_len) {
        return EOS;
    }
    return (unsigned char) stream->src[stream->pos.offset];
}

// Consumes and returns the byte at the current read position, or returns EOS
// (consuming nothing) when the position is at or past the end of the source.
//
// Position bookkeeping: after a '\n' the line counter is incremented, the
// column is reset to 0 and line_offset is set to the offset of the first byte
// of the new line; any other byte advances the column by one.
//
// The byte is converted through unsigned char so the result is always in the
// range 0..UCHAR_MAX. In case `src` holds plain (possibly signed) char, this
// keeps a 0xFF byte from being sign-extended to -1 and mistaken for EOS.
static int popc(TokenStream *stream) {
    if (stream->pos.offset >= stream->src_len) {
        return EOS;
    }
    int c = (unsigned char) stream->src[stream->pos.offset++];
    if (c == '\n') {
        // offset was already advanced, so it now points at the new line's start
        stream->pos.line_offset = stream->pos.offset;
        ++stream->pos.line;
        stream->pos.column = 0;
    } else {
        ++stream->pos.column;
    }
    return c;
}

// Advances the stream past any run of whitespace characters; stops at the
// first non-whitespace character or at the end of the source.
static void skip_whitespace(TokenStream *stream) {
    for (;;) {
        if (!is_whitespace(peekc(stream))) {
            break;
        }
        popc(stream);
    }
}

// Consumes characters up to and including the next '\n', or up to the end of
// the source if no newline follows.
static void discard_until_newline(TokenStream *stream) {
    int consumed;
    do {
        consumed = popc(stream);
    } while (consumed != '\n' && consumed != EOS);
}

// Maps one of the single-character tokens ('=', '\\', ':', '(', ')') to its
// token type. Any other character is a programming error and aborts.
static TokenType typeof_single_char_token(char c) {
    if (c == '=') {
        return TOKEN_EQUALS;
    }
    if (c == '\\') {
        return TOKEN_BACKSLASH;
    }
    if (c == ':') {
        return TOKEN_COLON;
    }
    if (c == '(') {
        return TOKEN_OPEN_PAREN;
    }
    if (c == ')') {
        return TOKEN_CLOSE_PAREN;
    }
    abort();
}

// Stores a token's type, length and a copy of its source position into `out`.
static inline void set_token(Token *out, const SrcPos *pos, TokenType type,
                             size_t length) {
    out->length = length;
    out->type = type;
    out->pos = *pos;
}

// Formats a human-readable rendering of `c` into `buf` (at most `buf_size`
// bytes including the terminator) for use in error messages:
//   - EOS                -> EOF
//   - printable byte     -> the character in single quotes, e.g. 'x'
//   - non-printable byte -> two hex digits in single quotes, e.g. '0x0A'
// Returns the result of snprintf().
static int pprint_character(int c, char *buf, size_t buf_size) {
    if (c == EOS) {
        return snprintf(buf, buf_size, "EOF");
    }
    // Reduce to the byte value first: <ctype.h> functions are undefined for
    // negative arguments other than EOF, and a sign-extended value would
    // print as FFFFFFxx instead of two hex digits.
    const unsigned char byte = (unsigned char) c;
    if (isprint(byte)) {
        return snprintf(buf, buf_size, "'%c'", (char) byte);
    }
    return snprintf(buf, buf_size, "'0x%02X'", (unsigned) byte);
}

// Records a printf-style formatted error at source position `pos` in `error`:
// marks it as set, copies the position and formats the message into the
// error's fixed-size message buffer. Does nothing when `error` is NULL.
ATTR_FORMAT(3, 4)
static void sprintf_error(ParseError *error, const SrcPos *pos,
                          const char *restrict fmt, ...) {
    if (!error) {
        return;
    }

    error->set = true;
    error->pos = *pos;

    va_list ap;
    va_start(ap, fmt);
    vsnprintf(error->message, sizeof(error->message), fmt, ap);
    va_end(ap);
}

// and now for the actual parser
// Shorthands for the tokenizer functions below. They expand to code that refers
// to local variables named `stream` and `out`, so they can only be used inside
// functions that have those names in scope.

// Pointer to the stream's current source position.
#define POS              (&stream->pos)
// Fill `out` with position `p`, token type `t` and length `l`.
#define SET_OUT(p, t, l) set_token(out, (p), (t), (l))
// Emit a zero-length EOF token at the current position.
#define SET_EOF()        SET_OUT(POS, TOKEN_EOF, 0)
// Consume one character from the stream.
#define DISCARD_CHAR()   popc(stream)
// Declare an 8-byte char buffer named `var` and format character `c` into it
// with pprint_character(). Expands to a declaration followed by a call with no
// trailing semicolon, so it must be used as a statement inside a block.
#define DEF_PPRINT(var, c) \
    char var[8];           \
    pprint_character((c), var, sizeof(var))
// Number of source bytes between the current offset and the end of the source.
#define BYTES_LEFT (stream->src_len - stream->pos.offset)

// Reads a run of identifier characters starting at the current position and
// emits either a keyword token (let / conf / eval) or an IDENT token.
static bool read_next_ident(TokenStream *restrict stream, Token *restrict out);
// Reads the rest of a reduce token, up to and including the closing '>'.
// The caller must already have consumed the leading '=' before calling this.
static bool read_next_reduce(TokenStream *restrict stream, Token *restrict out,
                             ParseError *restrict error);

// Reads the next token from `stream` into `out`.
//
// Leading whitespace and "--" line comments are skipped. At the end of the
// source a zero-length EOF token is produced. Returns true when a token was
// written to `out`; returns false on a lexical error, in which case `error`
// (if non-NULL) holds the position and message. `error->set` is cleared on
// entry when `error` is non-NULL.
bool token_stream_next(TokenStream *restrict stream, Token *restrict out,
                       ParseError *restrict error) {
    if (error) {
        error->set = false;
    }

    // Each iteration skips whitespace and then either returns a result or,
    // after discarding a "--" comment, goes around again.
    for (;;) {
        skip_whitespace(stream);
        const int c = peekc(stream);

        if (c == EOS) {
            set_token(out, &stream->pos, TOKEN_EOF, 0);
            return true;
        }

        if (c == '\\' || c == ':' || c == '(' || c == ')') {
            // record the position before consuming the character
            set_token(out, &stream->pos, typeof_single_char_token(c), 1);
            popc(stream);
            return true;
        }

        if (c == '=') {
            const SrcPos begin = stream->pos;
            popc(stream);
            const int following = peekc(stream);
            if (following != EOS && !is_whitespace(following)) {
                // '=' directly followed by something else starts a reduce
                // token; note the '=' was already consumed
                return read_next_reduce(stream, out, error);
            }
            set_token(out, &begin, typeof_single_char_token(c), 1);
            return true;
        }

        if (c == '-') {
            const SrcPos begin = stream->pos;
            popc(stream); // first '-'
            const int second = popc(stream);
            if (second == EOS) {
                sprintf_error(error, &stream->pos, "unexpected EOF");
                return false;
            }
            if (second == '-') {
                // "--" starts a comment that runs to the end of the line
                discard_until_newline(stream);
                continue;
            }
            // NOTE(review): any character other than '-' is consumed here and
            // accepted as the second character of the DEFINE token - confirm
            // whether a specific character should be required.
            set_token(out, &begin, TOKEN_DEFINE, 2);
            return true;
        }

        if (is_identifier(c)) {
            return read_next_ident(stream, out);
        }

        DEF_PPRINT(pp, c);
        sprintf_error(error, &stream->pos, "expected identifier, got %s", pp);
        return false;
    }
}

// Consumes a run of identifier characters starting at the current position and
// writes the resulting token to `out`: TOKEN_LET, TOKEN_CONF or TOKEN_EVAL when
// the lexeme is exactly "let", "conf" or "eval", otherwise TOKEN_IDENT. The
// token's position is where the lexeme starts and its length is the number of
// characters consumed. Always returns true.
static bool read_next_ident(TokenStream *restrict stream, Token *restrict out) {
    const SrcPos start = *POS;
    size_t len = 0;
    while (is_identifier(peekc(stream))) {
        popc(stream);
        ++len;
    }

    // The `len` bytes starting at start.offset were just read from the source,
    // so comparing them needs no extra bounds check. (Checking the bytes left
    // *after* the lexeme would misclassify a keyword that sits close to the
    // end of the input as a plain identifier.)
    const void *lexeme = &stream->src[start.offset];
    TokenType type = TOKEN_IDENT;
    if (len == 3 && memcmp(lexeme, "let", 3) == 0) {
        type = TOKEN_LET;
    } else if (len == 4 && memcmp(lexeme, "conf", 4) == 0) {
        type = TOKEN_CONF;
    } else if (len == 4 && memcmp(lexeme, "eval", 4) == 0) {
        type = TOKEN_EVAL;
    }
    SET_OUT(&start, type, len);
    return true;
}

// Reads the remainder of a reduce token; the caller has already consumed the
// leading '='. Consumes a (possibly empty) run of reduce characters, then
// requires a closing '>'. On success the token written to `out` is positioned
// at the first character after the '=' and its length counts only the
// characters before the '>'. On failure an "expected '>'" error is recorded
// and false is returned.
static bool read_next_reduce(TokenStream *restrict stream, Token *restrict out,
                             ParseError *restrict error) {
    const SrcPos begin = stream->pos;
    size_t count = 0;

    int next = peekc(stream);
    while (is_reduce(next)) {
        popc(stream);
        ++count;
        next = peekc(stream);
    }

    if (next != '>') {
        DEF_PPRINT(pp, next);
        sprintf_error(error, &stream->pos, "expected '>', got %s", pp);
        return false;
    }

    set_token(out, &begin, TOKEN_REDUCE, count);
    popc(stream); // the closing '>'
    return true;
}