235 lines
6.2 KiB
C
235 lines
6.2 KiB
C
#include "token.h"
|
|
|
|
#include "util.h"
|
|
|
|
#include <assert.h>
|
|
#include <ctype.h>
|
|
#include <stdarg.h>
|
|
#include <string.h>
|
|
|
|
// Maps a token type to its upper-case display name.
// Asserts that `type` is a valid index into the name table.
const char *token_type_to_string(TokenType type) {
    // Designated initializers keep each name tied to its enum value,
    // independent of the order the entries are listed in.
    static const char *const NAMES[] = {
        [TOKEN_EOF] = "EOF",
        [TOKEN_REDUCE] = "REDUCE",
        [TOKEN_IDENT] = "IDENT",
        [TOKEN_DEFINE] = "DEFINE",
        [TOKEN_CLOSE_PAREN] = "CLOSE_PAREN",
        [TOKEN_OPEN_PAREN] = "OPEN_PAREN",
        [TOKEN_COLON] = "COLON",
        [TOKEN_BACKSLASH] = "BACKSLASH",
        [TOKEN_EQUALS] = "EQUALS",
        [TOKEN_CONF] = "CONF",
        [TOKEN_EVAL] = "EVAL",
        [TOKEN_LET] = "LET",
    };

    assert(type >= 0 && type < (sizeof(NAMES) / sizeof(NAMES[0])));

    const char *name = NAMES[type];
    return name;
}
|
|
|
|
// Sentinel returned by peekc/popc when the end of the source is reached.
#define EOS (-1)
|
|
|
|
// True for the characters the tokenizer skips between tokens:
// space, horizontal tab and newline.
static bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
        return true;
    default:
        return false;
    }
}
|
|
|
|
// True for characters allowed in an identifier: lower-case ASCII letters
// and the underscore.
static bool is_identifier(char c) {
    if (c == '_') {
        return true;
    }
    return 'a' <= c && c <= 'z';
}
|
|
|
|
// True for characters allowed inside a reduce token: lower-case ASCII
// letters plus '*', '~' and ':'.
static bool is_reduce(char c) {
    switch (c) {
    case '*':
    case '~':
    case ':':
        return true;
    default:
        return 'a' <= c && c <= 'z';
    }
}
|
|
|
|
// Returns the byte at the current offset without consuming it, or EOS when
// the offset is at or past the end of the source.
static int peekc(TokenStream *stream) {
    if (stream->pos.offset >= stream->src_len) {
        return EOS;
    }
    // Convert through unsigned char so the result is always non-negative.
    // Without this, on platforms where plain char is signed, byte 0xFF would
    // come back as -1 and be mistaken for EOS.
    // NOTE(review): assumes src is a char-typed buffer; the cast is a no-op
    // if it is already unsigned - confirm against the TokenStream definition.
    return (unsigned char) stream->src[stream->pos.offset];
}
|
|
|
|
// Consumes and returns the byte at the current offset, or returns EOS
// (consuming nothing) when the offset is at or past the end of the source.
// Updates line/column bookkeeping: a newline advances `line`, resets
// `column` to 0 and records the offset of the new line's first byte in
// `line_offset`; any other byte advances `column`.
static int popc(TokenStream *stream) {
    if (stream->pos.offset >= stream->src_len) {
        return EOS;
    }
    // Convert through unsigned char so the result is always non-negative.
    // Without this, on platforms where plain char is signed, byte 0xFF would
    // come back as -1 and be mistaken for EOS.
    // NOTE(review): assumes src is a char-typed buffer; the cast is a no-op
    // if it is already unsigned - confirm against the TokenStream definition.
    int c = (unsigned char) stream->src[stream->pos.offset++];
    if (c == '\n') {
        // offset was already advanced, so it points at the next line's start
        stream->pos.line_offset = stream->pos.offset;
        ++stream->pos.line;
        stream->pos.column = 0;
    } else {
        ++stream->pos.column;
    }
    return c;
}
|
|
|
|
// Consumes characters until the next one is not whitespace
// (EOS is not whitespace, so this stops at end of input).
static void skip_whitespace(TokenStream *stream) {
    for (;;) {
        if (!is_whitespace(peekc(stream))) {
            break;
        }
        popc(stream);
    }
}
|
|
|
|
// Consumes characters up to and including the next newline, or until the
// end of input, whichever comes first.
static void discard_until_newline(TokenStream *stream) {
    int consumed;
    do {
        consumed = popc(stream);
    } while (consumed != '\n' && consumed != EOS);
}
|
|
|
|
static TokenType typeof_single_char_token(char c) {
|
|
switch (c) {
|
|
case '=':
|
|
return TOKEN_EQUALS;
|
|
case '\\':
|
|
return TOKEN_BACKSLASH;
|
|
case ':':
|
|
return TOKEN_COLON;
|
|
case '(':
|
|
return TOKEN_OPEN_PAREN;
|
|
case ')':
|
|
return TOKEN_CLOSE_PAREN;
|
|
default:
|
|
abort();
|
|
}
|
|
}
|
|
|
|
// Fills the position, type and length fields of `out`.
// The position is copied by value, so `pos` need not outlive the call.
static inline void set_token(Token *out, const SrcPos *pos, TokenType type,
                             size_t length) {
    out->type = type;
    out->length = length;
    out->pos = *pos;
}
|
|
|
|
static int pprint_character(int c, char *buf, size_t buf_size) {
|
|
if (c == EOS) {
|
|
return snprintf(buf, buf_size, "EOF");
|
|
} else if (isprint(c)) {
|
|
return snprintf(buf, buf_size, "'%c'", (char) c);
|
|
} else {
|
|
return snprintf(buf, buf_size, "'0x%02Xd'", c);
|
|
}
|
|
}
|
|
|
|
// Records a printf-style formatted error at `pos` into `error`.
// Does nothing when `error` is null, so callers may pass a null error
// object to ignore diagnostics. The message is truncated to fit
// error->message.
ATTR_FORMAT(3, 4)
static void sprintf_error(ParseError *error, const SrcPos *pos,
                          const char *restrict fmt, ...) {
    if (!error) {
        return;
    }

    va_list ap;
    va_start(ap, fmt);
    error->pos = *pos;
    error->set = true;
    vsnprintf(error->message, sizeof(error->message), fmt, ap);
    va_end(ap);
}
|
|
|
|
// and now for the actual parser
//
// The helper macros below are shorthand for the tokenizer functions. They
// refer to a variable named `stream` (and, for SET_OUT / SET_EOF, a variable
// named `out`) that must be in scope where the macro is used.

// Pointer to the stream's current source position.
#define POS (&stream->pos)
// Fills `out` with position p, type t and length l.
#define SET_OUT(p, t, l) set_token(out, (p), (t), (l))
// Fills `out` with a zero-length EOF token at the current position.
#define SET_EOF() SET_OUT(POS, TOKEN_EOF, 0)
// Consumes one character from the stream.
#define DISCARD_CHAR() popc(stream)
// Declares a local 8-byte buffer named `var` and formats character `c` into
// it. Expands to a declaration followed by a call, so it must be used inside
// a braced block where a declaration is allowed.
#define DEF_PPRINT(var, c) \
    char var[8]; \
    pprint_character((c), var, sizeof(var))
// Number of source bytes from the current offset to the end of the source.
#define BYTES_LEFT (stream->src_len - stream->pos.offset)
|
|
|
|
// Reads an identifier or keyword token starting at the current position.
static bool read_next_ident(TokenStream *restrict stream, Token *restrict out);
// Reads the body of a reduce token up to and including the closing '>'.
// The caller must already have consumed the leading '=' before calling this.
static bool read_next_reduce(TokenStream *restrict stream, Token *restrict out,
                             ParseError *restrict error);
|
|
|
|
bool token_stream_next(TokenStream *restrict stream, Token *restrict out,
|
|
ParseError *restrict error) {
|
|
if (error) {
|
|
error->set = false;
|
|
}
|
|
restart:
|
|
skip_whitespace(stream);
|
|
int c = peekc(stream);
|
|
switch (c) {
|
|
case EOS:
|
|
SET_EOF();
|
|
return true;
|
|
case '\\':
|
|
case ':':
|
|
case '(':
|
|
case ')':
|
|
SET_OUT(POS, typeof_single_char_token(c), 1);
|
|
DISCARD_CHAR();
|
|
return true;
|
|
case '=': {
|
|
SrcPos start = *POS;
|
|
DISCARD_CHAR();
|
|
int next = peekc(stream);
|
|
if (next == EOS || is_whitespace(next)) {
|
|
SET_OUT(&start, typeof_single_char_token(c), 1);
|
|
return true;
|
|
}
|
|
// note the '=' was already consumed
|
|
return read_next_reduce(stream, out, error);
|
|
}
|
|
case '-': {
|
|
SrcPos start = *POS;
|
|
popc(stream); // first '-'
|
|
int next = popc(stream);
|
|
if (next == EOS) {
|
|
sprintf_error(error, POS, "unexpected EOF");
|
|
return false;
|
|
} else if (next == '-') {
|
|
discard_until_newline(stream);
|
|
goto restart;
|
|
}
|
|
SET_OUT(&start, TOKEN_DEFINE, 2);
|
|
return true;
|
|
}
|
|
default:
|
|
if (!is_identifier(c)) {
|
|
DEF_PPRINT(pp, c);
|
|
sprintf_error(error, POS, "expected identifier, got %s", pp);
|
|
return false;
|
|
}
|
|
return read_next_ident(stream, out);
|
|
}
|
|
}
|
|
|
|
// Reads a run of identifier characters starting at the current position and
// stores the resulting token in `out`.
//
// The spellings "let", "conf" and "eval" produce TOKEN_LET, TOKEN_CONF and
// TOKEN_EVAL respectively; every other run produces TOKEN_IDENT. The token's
// position is where the run started and its length is the number of
// characters consumed. Always returns true.
static bool read_next_ident(TokenStream *restrict stream, Token *restrict out) {
    SrcPos start = *POS;
    size_t len = 0;
    while (is_identifier(peekc(stream))) {
        popc(stream);
        ++len;
    }

    // The `len` bytes starting at start.offset were just consumed, so they
    // are known to lie inside the source and can be compared directly. How
    // many bytes remain *after* the identifier is irrelevant: checking that
    // would misclassify a keyword sitting at (or near) the end of the input.
    TokenType type = TOKEN_IDENT;
    if (len == 3 && memcmp(&stream->src[start.offset], "let", 3) == 0) {
        type = TOKEN_LET;
    } else if (len == 4
               && memcmp(&stream->src[start.offset], "conf", 4) == 0) {
        type = TOKEN_CONF;
    } else if (len == 4
               && memcmp(&stream->src[start.offset], "eval", 4) == 0) {
        type = TOKEN_EVAL;
    }

    SET_OUT(&start, type, len);
    return true;
}
|
|
|
|
static bool read_next_reduce(TokenStream *restrict stream, Token *restrict out,
|
|
ParseError *restrict error) {
|
|
size_t len = 0;
|
|
SrcPos start = *POS;
|
|
int c;
|
|
while (is_reduce(c = peekc(stream))) {
|
|
popc(stream);
|
|
++len;
|
|
}
|
|
if (c == '>') {
|
|
SET_OUT(&start, TOKEN_REDUCE, len);
|
|
DISCARD_CHAR();
|
|
return true;
|
|
} else {
|
|
DEF_PPRINT(pp, c);
|
|
sprintf_error(error, POS, "expected '>', got %s", pp);
|
|
return false;
|
|
}
|
|
}
|