1 files changed, 490 insertions, 0 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
new file mode 100644
index 0000000..38f76ed
--- /dev/null
+++ b/Parser/tokenizer.c
@@ -0,0 +1,490 @@
+/* Tokenizer implementation */
+
+/* XXX This is rather old, should be restructured perhaps */
+/* XXX Need a better interface to report errors than writing to stderr */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "string.h"
+
+#include "PROTO.h"
+#include "malloc.h"
+#include "tokenizer.h"
+#include "errcode.h"
+
+#ifdef THINK_C
+#define TABSIZE 4
+#endif
+
+#ifndef TABSIZE
+#define TABSIZE 8
+#endif
+
+/* Token names */
+
+char *tok_name[] = {
+	"ENDMARKER",
+	"NAME",
+	"NUMBER",
+	"STRING",
+	"NEWLINE",
+	"INDENT",
+	"DEDENT",
+	"LPAR",
+	"RPAR",
+	"LSQB",
+	"RSQB",
+	"COLON",
+	"COMMA",
+	"SEMI",
+	"PLUS",
+	"MINUS",
+	"STAR",
+	"SLASH",
+	"VBAR",
+	"AMPER",
+	"LESS",
+	"GREATER",
+	"EQUAL",
+	"DOT",
+	"PERCENT",
+	"BACKQUOTE",
+	"LBRACE",
+	"RBRACE",
+	"OP",
+	"<ERRORTOKEN>",
+	"<N_TOKENS>"
+};
+
+
+/* Create and initialize a new tok_state structure */
+
+static struct tok_state *
+tok_new()
+{
+	struct tok_state *tok = NEW(struct tok_state, 1);
+	if (tok == NULL)
+		return NULL;
+	tok->buf = tok->cur = tok->end = tok->inp = NULL;
+	tok->done = E_OK;
+	tok->fp = NULL;
+	tok->tabsize = TABSIZE;
+	tok->indent = 0;
+	tok->indstack[0] = 0;
+	tok->atbol = 1;
+	tok->pendin = 0;
+	tok->prompt = tok->nextprompt = NULL;
+	tok->lineno = 0;
+	return tok;
+}
+
+
+/* Set up tokenizer for string */
+
+struct tok_state *
+tok_setups(str)
+	char *str;
+{
+	struct tok_state *tok = tok_new();
+	if (tok == NULL)
+		return NULL;
+	tok->buf = tok->cur = str;
+	tok->end = tok->inp = strchr(str, '\0');
+	return tok;
+}
+
+
+/* Set up tokenizer for string */
+
+struct tok_state *
+tok_setupf(fp, ps1, ps2)
+	FILE *fp;
+	char *ps1, *ps2;
+{
+	struct tok_state *tok = tok_new();
+	if (tok == NULL)
+		return NULL;
+	if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
+		DEL(tok);
+		return NULL;
+	}
+	tok->cur = tok->inp = tok->buf;
+	tok->end = tok->buf + BUFSIZ;
+	tok->fp = fp;
+	tok->prompt = ps1;
+	tok->nextprompt = ps2;
+	return tok;
+}
+
+
+/* Free a tok_state structure */
+
+void
+tok_free(tok)
+	struct tok_state *tok;
+{
+	/* XXX really need a separate flag to say 'my buffer' */
+	if (tok->fp != NULL && tok->buf != NULL)
+		DEL(tok->buf);
+	DEL(tok);
+}
+
+
+/* Get next char, updating state; error code goes into tok->done */
+
+static int
+tok_nextc(tok)
+	register struct tok_state *tok;
+{
+	if (tok->done != E_OK)
+		return EOF;
+	
+	for (;;) {
+		if (tok->cur < tok->inp)
+			return *tok->cur++;
+		if (tok->fp == NULL) {
+			tok->done = E_EOF;
+			return EOF;
+		}
+		if (tok->inp > tok->buf && tok->inp[-1] == '\n')
+			tok->inp = tok->buf;
+		if (tok->inp == tok->end) {
+			int n = tok->end - tok->buf;
+			char *new = tok->buf;
+			RESIZE(new, char, n+n);
+			if (new == NULL) {
+				fprintf(stderr, "tokenizer out of mem\n");
+				tok->done = E_NOMEM;
+				return EOF;
+			}
+			tok->buf = new;
+			tok->inp = tok->buf + n;
+			tok->end = tok->inp + n;
+		}
+#ifdef USE_READLINE
+		if (tok->prompt != NULL) {
+			extern char *readline PROTO((char *prompt));
+			static int been_here;
+			if (!been_here) {
+				/* Force rebind of TAB to insert-tab */
+				extern int rl_insert();
+				rl_bind_key('\t', rl_insert);
+				been_here++;
+			}
+			if (tok->buf != NULL)
+				free(tok->buf);
+			tok->buf = readline(tok->prompt);
+			(void) intrcheck(); /* Clear pending interrupt */
+			if (tok->nextprompt != NULL)
+				tok->prompt = tok->nextprompt;
+				/* XXX different semantics w/o readline()! */
+			if (tok->buf == NULL) {
+				tok->done = E_EOF;
+			}
+			else {
+				unsigned int n = strlen(tok->buf);
+				if (n > 0)
+					add_history(tok->buf);
+				/* Append the '\n' that readline()
+				   doesn't give us, for the tokenizer... */
+				tok->buf = realloc(tok->buf, n+2);
+				if (tok->buf == NULL)
+					tok->done = E_NOMEM;
+				else {
+					tok->end = tok->buf + n;
+					*tok->end++ = '\n';
+					*tok->end = '\0';
+					tok->inp = tok->end;
+					tok->cur = tok->buf;
+				}
+			}
+		}
+		else
+#endif
+		{
+			tok->cur = tok->inp;
+			if (tok->prompt != NULL && tok->inp == tok->buf) {
+				fprintf(stderr, "%s", tok->prompt);
+				tok->prompt = tok->nextprompt;
+			}
+			tok->done = fgets_intr(tok->inp,
+				(int)(tok->end - tok->inp), tok->fp);
+		}
+		if (tok->done != E_OK) {
+			if (tok->prompt != NULL)
+				fprintf(stderr, "\n");
+			return EOF;
+		}
+		tok->inp = strchr(tok->inp, '\0');
+	}
+}
+
+
+/* Back-up one character */
+
+static void
+tok_backup(tok, c)
+	register struct tok_state *tok;
+	register int c;
+{
+	if (c != EOF) {
+		if (--tok->cur < tok->buf) {
+			fprintf(stderr, "tok_backup: begin of buffer\n");
+			abort();
+		}
+		if (*tok->cur != c)
+			*tok->cur = c;
+	}
+}
+
+
+/* Return the token corresponding to a single character */
+
+int
+tok_1char(c)
+	int c;
+{
+	switch (c) {
+	case '(':	return LPAR;
+	case ')':	return RPAR;
+	case '[':	return LSQB;
+	case ']':	return RSQB;
+	case ':':	return COLON;
+	case ',':	return COMMA;
+	case ';':	return SEMI;
+	case '+':	return PLUS;
+	case '-':	return MINUS;
+	case '*':	return STAR;
+	case '/':	return SLASH;
+	case '|':	return VBAR;
+	case '&':	return AMPER;
+	case '<':	return LESS;
+	case '>':	return GREATER;
+	case '=':	return EQUAL;
+	case '.':	return DOT;
+	case '%':	return PERCENT;
+	case '`':	return BACKQUOTE;
+	case '{':	return LBRACE;
+	case '}':	return RBRACE;
+	default:	return OP;
+	}
+}
+
+
+/* Get next token, after space stripping etc. */
+
+int
+tok_get(tok, p_start, p_end)
+	register struct tok_state *tok; /* In/out: tokenizer state */
+	char **p_start, **p_end; /* Out: point to start/end of token */
+{
+	register int c;
+	
+	/* Get indentation level */
+	if (tok->atbol) {
+		register int col = 0;
+		tok->atbol = 0;
+		tok->lineno++;
+		for (;;) {
+			c = tok_nextc(tok);
+			if (c == ' ')
+				col++;
+			else if (c == '\t')
+				col = (col/tok->tabsize + 1) * tok->tabsize;
+			else
+				break;
+		}
+		tok_backup(tok, c);
+		if (col == tok->indstack[tok->indent]) {
+			/* No change */
+		}
+		else if (col > tok->indstack[tok->indent]) {
+			/* Indent -- always one */
+			if (tok->indent+1 >= MAXINDENT) {
+				fprintf(stderr, "excessive indent\n");
+				tok->done = E_TOKEN;
+				return ERRORTOKEN;
+			}
+			tok->pendin++;
+			tok->indstack[++tok->indent] = col;
+		}
+		else /* col < tok->indstack[tok->indent] */ {
+			/* Dedent -- any number, must be consistent */
+			while (tok->indent > 0 &&
+				col < tok->indstack[tok->indent]) {
+				tok->indent--;
+				tok->pendin--;
+			}
+			if (col != tok->indstack[tok->indent]) {
+				fprintf(stderr, "inconsistent dedent\n");
+				tok->done = E_TOKEN;
+				return ERRORTOKEN;
+			}
+		}
+	}
+	
+	*p_start = *p_end = tok->cur;
+	
+	/* Return pending indents/dedents */
+	if (tok->pendin != 0) {
+		if (tok->pendin < 0) {
+			tok->pendin++;
+			return DEDENT;
+		}
+		else {
+			tok->pendin--;
+			return INDENT;
+		}
+	}
+	
+ again:
+	/* Skip spaces */
+	do {
+		c = tok_nextc(tok);
+	} while (c == ' ' || c == '\t');
+	
+	/* Set start of current token */
+	*p_start = tok->cur - 1;
+	
+	/* Skip comment */
+	if (c == '#') {
+		/* Hack to allow overriding the tabsize in the file.
+		   This is also recognized by vi, when it occurs near the
+		   beginning or end of the file.  (Will vi never die...?) */
+		int x;
+		if (sscanf(tok->cur, " vi:set tabsize=%d:", &x) == 1 &&
+						x >= 1 && x <= 40) {
+			fprintf(stderr, "# vi:set tabsize=%d:\n", x);
+			tok->tabsize = x;
+		}
+		do {
+			c = tok_nextc(tok);
+		} while (c != EOF && c != '\n');
+	}
+	
+	/* Check for EOF and errors now */
+	if (c == EOF)
+		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
+	
+	/* Identifier (most frequent token!) */
+	if (isalpha(c) || c == '_') {
+		do {
+			c = tok_nextc(tok);
+		} while (isalnum(c) || c == '_');
+		tok_backup(tok, c);
+		*p_end = tok->cur;
+		return NAME;
+	}
+	
+	/* Newline */
+	if (c == '\n') {
+		tok->atbol = 1;
+		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
+		return NEWLINE;
+	}
+	
+	/* Number */
+	if (isdigit(c)) {
+		if (c == '0') {
+			/* Hex or octal */
+			c = tok_nextc(tok);
+			if (c == '.')
+				goto fraction;
+			if (c == 'x' || c == 'X') {
+				/* Hex */
+				do {
+					c = tok_nextc(tok);
+				} while (isxdigit(c));
+			}
+			else {
+				/* Octal; c is first char of it */
+				/* There's no 'isoctdigit' macro, sigh */
+				while ('0' <= c && c < '8') {
+					c = tok_nextc(tok);
+				}
+			}
+		}
+		else {
+			/* Decimal */
+			do {
+				c = tok_nextc(tok);
+			} while (isdigit(c));
+			/* Accept floating point numbers.
+			   XXX This accepts incomplete things like 12e or 1e+;
+			       worry about that at run-time.
+			   XXX Doesn't accept numbers starting with a dot */
+			if (c == '.') {
+	fraction:
+				/* Fraction */
+				do {
+					c = tok_nextc(tok);
+				} while (isdigit(c));
+			}
+			if (c == 'e' || c == 'E') {
+				/* Exponent part */
+				c = tok_nextc(tok);
+				if (c == '+' || c == '-')
+					c = tok_nextc(tok);
+				while (isdigit(c)) {
+					c = tok_nextc(tok);
+				}
+			}
+		}
+		tok_backup(tok, c);
+		*p_end = tok->cur;
+		return NUMBER;
+	}
+	
+	/* String */
+	if (c == '\'') {
+		for (;;) {
+			c = tok_nextc(tok);
+			if (c == '\n' || c == EOF) {
+				tok->done = E_TOKEN;
+				return ERRORTOKEN;
+			}
+			if (c == '\\') {
+				c = tok_nextc(tok);
+				*p_end = tok->cur;
+				if (c == '\n' || c == EOF) {
+					tok->done = E_TOKEN;
+					return ERRORTOKEN;
+				}
+				continue;
+			}
+			if (c == '\'')
+				break;
+		}
+		*p_end = tok->cur;
+		return STRING;
+	}
+	
+	/* Line continuation */
+	if (c == '\\') {
+		c = tok_nextc(tok);
+		if (c != '\n') {
+			tok->done = E_TOKEN;
+			return ERRORTOKEN;
+		}
+		goto again; /* Read next line */
+	}
+	
+	/* Punctuation character */
+	*p_end = tok->cur;
+	return tok_1char(c);
+}
+
+
+#ifdef DEBUG
+
+void
+tok_dump(type, start, end)
+	int type;
+	char *start, *end;
+{
+	printf("%s", tok_name[type]);
+	if (type == NAME || type == NUMBER || type == STRING || type == OP)
+		printf("(%.*s)", (int)(end - start), start);
+}
+
+#endif