diff options
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 490 |
1 files changed, 490 insertions, 0 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c new file mode 100644 index 0000000..38f76ed --- /dev/null +++ b/Parser/tokenizer.c @@ -0,0 +1,490 @@ +/* Tokenizer implementation */ + +/* XXX This is rather old, should be restructured perhaps */ +/* XXX Need a better interface to report errors than writing to stderr */ + +#include <stdio.h> +#include <ctype.h> +#include "string.h" + +#include "PROTO.h" +#include "malloc.h" +#include "tokenizer.h" +#include "errcode.h" + +#ifdef THINK_C +#define TABSIZE 4 +#endif + +#ifndef TABSIZE +#define TABSIZE 8 +#endif + +/* Token names */ + +char *tok_name[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "BACKQUOTE", + "LBRACE", + "RBRACE", + "OP", + "<ERRORTOKEN>", + "<N_TOKENS>" +}; + + +/* Create and initialize a new tok_state structure */ + +static struct tok_state * +tok_new() +{ + struct tok_state *tok = NEW(struct tok_state, 1); + if (tok == NULL) + return NULL; + tok->buf = tok->cur = tok->end = tok->inp = NULL; + tok->done = E_OK; + tok->fp = NULL; + tok->tabsize = TABSIZE; + tok->indent = 0; + tok->indstack[0] = 0; + tok->atbol = 1; + tok->pendin = 0; + tok->prompt = tok->nextprompt = NULL; + tok->lineno = 0; + return tok; +} + + +/* Set up tokenizer for string */ + +struct tok_state * +tok_setups(str) + char *str; +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + tok->buf = tok->cur = str; + tok->end = tok->inp = strchr(str, '\0'); + return tok; +} + + +/* Set up tokenizer for string */ + +struct tok_state * +tok_setupf(fp, ps1, ps2) + FILE *fp; + char *ps1, *ps2; +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + if ((tok->buf = NEW(char, BUFSIZ)) == NULL) { + DEL(tok); + return NULL; + } + tok->cur = tok->inp = tok->buf; + tok->end = tok->buf + BUFSIZ; + tok->fp = fp; + tok->prompt = ps1; + tok->nextprompt = ps2; + return tok; +} + + +/* Free a tok_state structure */ + +void +tok_free(tok) + struct tok_state *tok; +{ + /* XXX really need a separate flag to say 'my buffer' */ + if (tok->fp != NULL && tok->buf != NULL) + DEL(tok->buf); + DEL(tok); +} + + +/* Get next char, updating state; error code goes into tok->done */ + +static int +tok_nextc(tok) + register struct tok_state *tok; +{ + if (tok->done != E_OK) + return EOF; + + for (;;) { + if (tok->cur < tok->inp) + return *tok->cur++; + if (tok->fp == NULL) { + tok->done = E_EOF; + return EOF; + } + if (tok->inp > tok->buf && tok->inp[-1] == '\n') + tok->inp = tok->buf; + if (tok->inp == tok->end) { + int n = tok->end - tok->buf; + char *new = tok->buf; + RESIZE(new, char, n+n); + if (new == NULL) { + fprintf(stderr, "tokenizer out of mem\n"); + tok->done = E_NOMEM; + return EOF; + } + tok->buf = new; + tok->inp = tok->buf + n; + tok->end = tok->inp + n; + } +#ifdef USE_READLINE + if (tok->prompt != NULL) { + extern char *readline PROTO((char *prompt)); + static int been_here; + if (!been_here) { + /* Force rebind of TAB to insert-tab */ + extern int rl_insert(); + rl_bind_key('\t', rl_insert); + been_here++; + } + if (tok->buf != NULL) + free(tok->buf); + tok->buf = readline(tok->prompt); + (void) intrcheck(); /* Clear pending interrupt */ + if (tok->nextprompt != NULL) + tok->prompt = tok->nextprompt; + /* XXX different semantics w/o readline()! */ + if (tok->buf == NULL) { + tok->done = E_EOF; + } + else { + unsigned int n = strlen(tok->buf); + if (n > 0) + add_history(tok->buf); + /* Append the '\n' that readline() + doesn't give us, for the tokenizer... */ + tok->buf = realloc(tok->buf, n+2); + if (tok->buf == NULL) + tok->done = E_NOMEM; + else { + tok->end = tok->buf + n; + *tok->end++ = '\n'; + *tok->end = '\0'; + tok->inp = tok->end; + tok->cur = tok->buf; + } + } + } + else +#endif + { + tok->cur = tok->inp; + if (tok->prompt != NULL && tok->inp == tok->buf) { + fprintf(stderr, "%s", tok->prompt); + tok->prompt = tok->nextprompt; + } + tok->done = fgets_intr(tok->inp, + (int)(tok->end - tok->inp), tok->fp); + } + if (tok->done != E_OK) { + if (tok->prompt != NULL) + fprintf(stderr, "\n"); + return EOF; + } + tok->inp = strchr(tok->inp, '\0'); + } +} + + +/* Back-up one character */ + +static void +tok_backup(tok, c) + register struct tok_state *tok; + register int c; +{ + if (c != EOF) { + if (--tok->cur < tok->buf) { + fprintf(stderr, "tok_backup: begin of buffer\n"); + abort(); + } + if (*tok->cur != c) + *tok->cur = c; + } +} + + +/* Return the token corresponding to a single character */ + +int +tok_1char(c) + int c; +{ + switch (c) { + case '(': return LPAR; + case ')': return RPAR; + case '[': return LSQB; + case ']': return RSQB; + case ':': return COLON; + case ',': return COMMA; + case ';': return SEMI; + case '+': return PLUS; + case '-': return MINUS; + case '*': return STAR; + case '/': return SLASH; + case '|': return VBAR; + case '&': return AMPER; + case '<': return LESS; + case '>': return GREATER; + case '=': return EQUAL; + case '.': return DOT; + case '%': return PERCENT; + case '`': return BACKQUOTE; + case '{': return LBRACE; + case '}': return RBRACE; + default: return OP; + } +} + + +/* Get next token, after space stripping etc. */ + +int +tok_get(tok, p_start, p_end) + register struct tok_state *tok; /* In/out: tokenizer state */ + char **p_start, **p_end; /* Out: point to start/end of token */ +{ + register int c; + + /* Get indentation level */ + if (tok->atbol) { + register int col = 0; + tok->atbol = 0; + tok->lineno++; + for (;;) { + c = tok_nextc(tok); + if (c == ' ') + col++; + else if (c == '\t') + col = (col/tok->tabsize + 1) * tok->tabsize; + else + break; + } + tok_backup(tok, c); + if (col == tok->indstack[tok->indent]) { + /* No change */ + } + else if (col > tok->indstack[tok->indent]) { + /* Indent -- always one */ + if (tok->indent+1 >= MAXINDENT) { + fprintf(stderr, "excessive indent\n"); + tok->done = E_TOKEN; + return ERRORTOKEN; + } + tok->pendin++; + tok->indstack[++tok->indent] = col; + } + else /* col < tok->indstack[tok->indent] */ { + /* Dedent -- any number, must be consistent */ + while (tok->indent > 0 && + col < tok->indstack[tok->indent]) { + tok->indent--; + tok->pendin--; + } + if (col != tok->indstack[tok->indent]) { + fprintf(stderr, "inconsistent dedent\n"); + tok->done = E_TOKEN; + return ERRORTOKEN; + } + } + } + + *p_start = *p_end = tok->cur; + + /* Return pending indents/dedents */ + if (tok->pendin != 0) { + if (tok->pendin < 0) { + tok->pendin++; + return DEDENT; + } + else { + tok->pendin--; + return INDENT; + } + } + + again: + /* Skip spaces */ + do { + c = tok_nextc(tok); + } while (c == ' ' || c == '\t'); + + /* Set start of current token */ + *p_start = tok->cur - 1; + + /* Skip comment */ + if (c == '#') { + /* Hack to allow overriding the tabsize in the file. + This is also recognized by vi, when it occurs near the + beginning or end of the file. (Will vi never die...?) */ + int x; + if (sscanf(tok->cur, " vi:set tabsize=%d:", &x) == 1 && + x >= 1 && x <= 40) { + fprintf(stderr, "# vi:set tabsize=%d:\n", x); + tok->tabsize = x; + } + do { + c = tok_nextc(tok); + } while (c != EOF && c != '\n'); + } + + /* Check for EOF and errors now */ + if (c == EOF) + return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; + + /* Identifier (most frequent token!) */ + if (isalpha(c) || c == '_') { + do { + c = tok_nextc(tok); + } while (isalnum(c) || c == '_'); + tok_backup(tok, c); + *p_end = tok->cur; + return NAME; + } + + /* Newline */ + if (c == '\n') { + tok->atbol = 1; + *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + return NEWLINE; + } + + /* Number */ + if (isdigit(c)) { + if (c == '0') { + /* Hex or octal */ + c = tok_nextc(tok); + if (c == '.') + goto fraction; + if (c == 'x' || c == 'X') { + /* Hex */ + do { + c = tok_nextc(tok); + } while (isxdigit(c)); + } + else { + /* Octal; c is first char of it */ + /* There's no 'isoctdigit' macro, sigh */ + while ('0' <= c && c < '8') { + c = tok_nextc(tok); + } + } + } + else { + /* Decimal */ + do { + c = tok_nextc(tok); + } while (isdigit(c)); + /* Accept floating point numbers. + XXX This accepts incomplete things like 12e or 1e+; + worry about that at run-time. + XXX Doesn't accept numbers starting with a dot */ + if (c == '.') { + fraction: + /* Fraction */ + do { + c = tok_nextc(tok); + } while (isdigit(c)); + } + if (c == 'e' || c == 'E') { + /* Exponent part */ + c = tok_nextc(tok); + if (c == '+' || c == '-') + c = tok_nextc(tok); + while (isdigit(c)) { + c = tok_nextc(tok); + } + } + } + tok_backup(tok, c); + *p_end = tok->cur; + return NUMBER; + } + + /* String */ + if (c == '\'') { + for (;;) { + c = tok_nextc(tok); + if (c == '\n' || c == EOF) { + tok->done = E_TOKEN; + return ERRORTOKEN; + } + if (c == '\\') { + c = tok_nextc(tok); + *p_end = tok->cur; + if (c == '\n' || c == EOF) { + tok->done = E_TOKEN; + return ERRORTOKEN; + } + continue; + } + if (c == '\'') + break; + } + *p_end = tok->cur; + return STRING; + } + + /* Line continuation */ + if (c == '\\') { + c = tok_nextc(tok); + if (c != '\n') { + tok->done = E_TOKEN; + return ERRORTOKEN; + } + goto again; /* Read next line */ + } + + /* Punctuation character */ + *p_end = tok->cur; + return tok_1char(c); +} + + +#ifdef DEBUG + +void +tok_dump(type, start, end) + int type; + char *start, *end; +{ + printf("%s", tok_name[type]); + if (type == NAME || type == NUMBER || type == STRING || type == OP) + printf("(%.*s)", (int)(end - start), start); +} + +#endif |