diff options
author | Guido van Rossum <guido@python.org> | 1990-10-14 12:07:46 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1990-10-14 12:07:46 (GMT) |
commit | 85a5fbbdfea617f6cc8fae82c9e8c2b5c424436d (patch) | |
tree | a1bf57db1c75e2a7029c8f2fad5f8dba4b9ba25c /Parser | |
parent | c636014c430620325f8d213e9ba10d925991b8d7 (diff) | |
download | cpython-85a5fbbdfea617f6cc8fae82c9e8c2b5c424436d.zip cpython-85a5fbbdfea617f6cc8fae82c9e8c2b5c424436d.tar.gz cpython-85a5fbbdfea617f6cc8fae82c9e8c2b5c424436d.tar.bz2 |
Initial revision
Diffstat (limited to 'Parser')
-rw-r--r-- | Parser/acceler.c | 101 | ||||
-rw-r--r-- | Parser/assert.h | 1 | ||||
-rw-r--r-- | Parser/bitset.c | 76 | ||||
-rw-r--r-- | Parser/firstsets.c | 109 | ||||
-rw-r--r-- | Parser/grammar.c | 207 | ||||
-rw-r--r-- | Parser/grammar1.c | 52 | ||||
-rw-r--r-- | Parser/intrcheck.c | 95 | ||||
-rw-r--r-- | Parser/listnode.c | 68 | ||||
-rw-r--r-- | Parser/metagrammar.c | 151 | ||||
-rw-r--r-- | Parser/node.c | 47 | ||||
-rw-r--r-- | Parser/parser.c | 396 | ||||
-rw-r--r-- | Parser/parser.h | 25 | ||||
-rw-r--r-- | Parser/parsetok.c | 131 | ||||
-rw-r--r-- | Parser/pgen.c | 729 | ||||
-rw-r--r-- | Parser/pgen.h | 6 | ||||
-rw-r--r-- | Parser/pgenmain.c | 111 | ||||
-rw-r--r-- | Parser/printgrammar.c | 121 | ||||
-rw-r--r-- | Parser/tokenizer.c | 490 | ||||
-rw-r--r-- | Parser/tokenizer.h | 29 |
19 files changed, 2945 insertions, 0 deletions
diff --git a/Parser/acceler.c b/Parser/acceler.c new file mode 100644 index 0000000..7ff10b2 --- /dev/null +++ b/Parser/acceler.c @@ -0,0 +1,101 @@ +/* Parser accelerator module */ + +#include <stdio.h> + +#include "PROTO.h" +#include "grammar.h" +#include "token.h" +#include "malloc.h" + +static void +fixstate(g, d, s) + grammar *g; + dfa *d; + state *s; +{ + arc *a; + int k; + int *accel; + int nl = g->g_ll.ll_nlabels; + s->s_accept = 0; + accel = NEW(int, nl); + for (k = 0; k < nl; k++) + accel[k] = -1; + a = s->s_arc; + for (k = s->s_narcs; --k >= 0; a++) { + int lbl = a->a_lbl; + label *l = &g->g_ll.ll_label[lbl]; + int type = l->lb_type; + if (a->a_arrow >= (1 << 7)) { + printf("XXX too many states!\n"); + continue; + } + if (ISNONTERMINAL(type)) { + dfa *d1 = finddfa(g, type); + int ibit; + if (type - NT_OFFSET >= (1 << 7)) { + printf("XXX too high nonterminal number!\n"); + continue; + } + for (ibit = 0; ibit < g->g_ll.ll_nlabels; ibit++) { + if (testbit(d1->d_first, ibit)) { + if (accel[ibit] != -1) + printf("XXX ambiguity!\n"); + accel[ibit] = a->a_arrow | (1 << 7) | + ((type - NT_OFFSET) << 8); + } + } + } + else if (lbl == EMPTY) + s->s_accept = 1; + else if (lbl >= 0 && lbl < nl) + accel[lbl] = a->a_arrow; + } + while (nl > 0 && accel[nl-1] == -1) + nl--; + for (k = 0; k < nl && accel[k] == -1;) + k++; + if (k < nl) { + int i; + s->s_accel = NEW(int, nl-k); + if (s->s_accel == NULL) { + fprintf(stderr, "no mem to add parser accelerators\n"); + exit(1); + } + s->s_lower = k; + s->s_upper = nl; + for (i = 0; k < nl; i++, k++) + s->s_accel[i] = accel[k]; + } + DEL(accel); +} + +static void +fixdfa(g, d) + grammar *g; + dfa *d; +{ + state *s; + int j; + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) + fixstate(g, d, s); +} + +void +addaccelerators(g) + grammar *g; +{ + dfa *d; + int i; +#ifdef DEBUG + printf("Adding parser accellerators ...\n"); +#endif + d = g->g_dfa; + for (i = g->g_ndfas; --i >= 0; d++) + fixdfa(g, d); + g->g_accel = 1; +#ifdef DEBUG + printf("Done.\n"); +#endif +} diff --git a/Parser/assert.h b/Parser/assert.h new file mode 100644 index 0000000..e8edafc --- /dev/null +++ b/Parser/assert.h @@ -0,0 +1 @@ +#define assert(e) { if (!(e)) { printf("Assertion failed\n"); abort(); } } diff --git a/Parser/bitset.c b/Parser/bitset.c new file mode 100644 index 0000000..b43fb82 --- /dev/null +++ b/Parser/bitset.c @@ -0,0 +1,76 @@ +/* Bitset primitives */ + +#include "PROTO.h" +#include "malloc.h" +#include "bitset.h" + +bitset +newbitset(nbits) + int nbits; +{ + int nbytes = NBYTES(nbits); + bitset ss = NEW(BYTE, nbytes); + + if (ss == NULL) + fatal("no mem for bitset"); + + ss += nbytes; + while (--nbytes >= 0) + *--ss = 0; + return ss; +} + +void +delbitset(ss) + bitset ss; +{ + DEL(ss); +} + +int +addbit(ss, ibit) + bitset ss; + int ibit; +{ + int ibyte = BIT2BYTE(ibit); + BYTE mask = BIT2MASK(ibit); + + if (ss[ibyte] & mask) + return 0; /* Bit already set */ + ss[ibyte] |= mask; + return 1; +} + +#if 0 /* Now a macro */ +int +testbit(ss, ibit) + bitset ss; + int ibit; +{ + return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0; +} +#endif + +int +samebitset(ss1, ss2, nbits) + bitset ss1, ss2; + int nbits; +{ + int i; + + for (i = NBYTES(nbits); --i >= 0; ) + if (*ss1++ != *ss2++) + return 0; + return 1; +} + +void +mergebitset(ss1, ss2, nbits) + bitset ss1, ss2; + int nbits; +{ + int i; + + for (i = NBYTES(nbits); --i >= 0; ) + *ss1++ |= *ss2++; +} diff --git a/Parser/firstsets.c b/Parser/firstsets.c new file mode 100644 index 0000000..0f28dd0 --- /dev/null +++ b/Parser/firstsets.c @@ -0,0 +1,109 @@ +/* Computation of FIRST stets */ + +#include <stdio.h> + +#include "PROTO.h" +#include "malloc.h" +#include "grammar.h" +#include "token.h" + +extern int debugging; + +static void +calcfirstset(g, d) + grammar *g; + dfa *d; +{ + int i, j; + state *s; + arc *a; + int nsyms; + int *sym; + int nbits; + static bitset dummy; + bitset result; + int type; + dfa *d1; + label *l0; + + if (debugging) + printf("Calculate FIRST set for '%s'\n", d->d_name); + + if (dummy == NULL) + dummy = newbitset(1); + if (d->d_first == dummy) { + fprintf(stderr, "Left-recursion for '%s'\n", d->d_name); + return; + } + if (d->d_first != NULL) { + fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n", + d->d_name); + } + d->d_first = dummy; + + l0 = g->g_ll.ll_label; + nbits = g->g_ll.ll_nlabels; + result = newbitset(nbits); + + sym = NEW(int, 1); + if (sym == NULL) + fatal("no mem for new sym in calcfirstset"); + nsyms = 1; + sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL); + + s = &d->d_state[d->d_initial]; + for (i = 0; i < s->s_narcs; i++) { + a = &s->s_arc[i]; + for (j = 0; j < nsyms; j++) { + if (sym[j] == a->a_lbl) + break; + } + if (j >= nsyms) { /* New label */ + RESIZE(sym, int, nsyms + 1); + if (sym == NULL) + fatal("no mem to resize sym in calcfirstset"); + sym[nsyms++] = a->a_lbl; + type = l0[a->a_lbl].lb_type; + if (ISNONTERMINAL(type)) { + d1 = finddfa(g, type); + if (d1->d_first == dummy) { + fprintf(stderr, + "Left-recursion below '%s'\n", + d->d_name); + } + else { + if (d1->d_first == NULL) + calcfirstset(g, d1); + mergebitset(result, d1->d_first, nbits); + } + } + else if (ISTERMINAL(type)) { + addbit(result, a->a_lbl); + } + } + } + d->d_first = result; + if (debugging) { + printf("FIRST set for '%s': {", d->d_name); + for (i = 0; i < nbits; i++) { + if (testbit(result, i)) + printf(" %s", labelrepr(&l0[i])); + } + printf(" }\n"); + } +} + +void +addfirstsets(g) + grammar *g; +{ + int i; + dfa *d; + + printf("Adding FIRST sets ...\n"); + for (i = 0; i < g->g_ndfas; i++) { + d = &g->g_dfa[i]; + if (d->d_first == NULL) + calcfirstset(g, d); + } +} diff --git a/Parser/grammar.c b/Parser/grammar.c new file mode 100644 index 0000000..ad9c2b0 --- /dev/null +++ b/Parser/grammar.c @@ -0,0 +1,207 @@ +/* Grammar implementation */ + +#include <stdio.h> +#include <ctype.h> + +#include "PROTO.h" +#include "malloc.h" +#include "assert.h" +#include "token.h" +#include "grammar.h" + +extern int debugging; + +grammar * +newgrammar(start) + int start; +{ + grammar *g; + + g = NEW(grammar, 1); + if (g == NULL) + fatal("no mem for new grammar"); + g->g_ndfas = 0; + g->g_dfa = NULL; + g->g_start = start; + g->g_ll.ll_nlabels = 0; + g->g_ll.ll_label = NULL; + return g; +} + +dfa * +adddfa(g, type, name) + grammar *g; + int type; + char *name; +{ + dfa *d; + + RESIZE(g->g_dfa, dfa, g->g_ndfas + 1); + if (g->g_dfa == NULL) + fatal("no mem to resize dfa in adddfa"); + d = &g->g_dfa[g->g_ndfas++]; + d->d_type = type; + d->d_name = name; + d->d_nstates = 0; + d->d_state = NULL; + d->d_initial = -1; + d->d_first = NULL; + return d; /* Only use while fresh! */ +} + +int +addstate(d) + dfa *d; +{ + state *s; + + RESIZE(d->d_state, state, d->d_nstates + 1); + if (d->d_state == NULL) + fatal("no mem to resize state in addstate"); + s = &d->d_state[d->d_nstates++]; + s->s_narcs = 0; + s->s_arc = NULL; + return s - d->d_state; +} + +void +addarc(d, from, to, lbl) + dfa *d; + int lbl; +{ + state *s; + arc *a; + + assert(0 <= from && from < d->d_nstates); + assert(0 <= to && to < d->d_nstates); + + s = &d->d_state[from]; + RESIZE(s->s_arc, arc, s->s_narcs + 1); + if (s->s_arc == NULL) + fatal("no mem to resize arc list in addarc"); + a = &s->s_arc[s->s_narcs++]; + a->a_lbl = lbl; + a->a_arrow = to; +} + +int +addlabel(ll, type, str) + labellist *ll; + int type; + char *str; +{ + int i; + label *lb; + + for (i = 0; i < ll->ll_nlabels; i++) { + if (ll->ll_label[i].lb_type == type && + strcmp(ll->ll_label[i].lb_str, str) == 0) + return i; + } + RESIZE(ll->ll_label, label, ll->ll_nlabels + 1); + if (ll->ll_label == NULL) + fatal("no mem to resize labellist in addlabel"); + lb = &ll->ll_label[ll->ll_nlabels++]; + lb->lb_type = type; + lb->lb_str = str; /* XXX strdup(str) ??? */ + return lb - ll->ll_label; +} + +/* Same, but rather dies than adds */ + +int +findlabel(ll, type, str) + labellist *ll; + int type; + char *str; +{ + int i; + label *lb; + + for (i = 0; i < ll->ll_nlabels; i++) { + if (ll->ll_label[i].lb_type == type /*&& + strcmp(ll->ll_label[i].lb_str, str) == 0*/) + return i; + } + fprintf(stderr, "Label %d/'%s' not found\n", type, str); + abort(); +} + +static void +translabel(g, lb) + grammar *g; + label *lb; +{ + int i; + + if (debugging) + printf("Translating label %s ...\n", labelrepr(lb)); + + if (lb->lb_type == NAME) { + for (i = 0; i < g->g_ndfas; i++) { + if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) { + if (debugging) + printf("Label %s is non-terminal %d.\n", + lb->lb_str, + g->g_dfa[i].d_type); + lb->lb_type = g->g_dfa[i].d_type; + lb->lb_str = NULL; + return; + } + } + for (i = 0; i < (int)N_TOKENS; i++) { + if (strcmp(lb->lb_str, tok_name[i]) == 0) { + if (debugging) + printf("Label %s is terminal %d.\n", + lb->lb_str, i); + lb->lb_type = i; + lb->lb_str = NULL; + return; + } + } + printf("Can't translate NAME label '%s'\n", lb->lb_str); + return; + } + + if (lb->lb_type == STRING) { + if (isalpha(lb->lb_str[1])) { + char *p, *strchr(); + if (debugging) + printf("Label %s is a keyword\n", lb->lb_str); + lb->lb_type = NAME; + lb->lb_str++; + p = strchr(lb->lb_str, '\''); + if (p) + *p = '\0'; + } + else { + if (lb->lb_str[2] == lb->lb_str[0]) { + int type = (int) tok_1char(lb->lb_str[1]); + if (type != OP) { + lb->lb_type = type; + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else + printf("Can't translate STRING label %s\n", + lb->lb_str); + } + } + else + printf("Can't translate label '%s'\n", labelrepr(lb)); +} + +void +translatelabels(g) + grammar *g; +{ + int i; + + printf("Translating labels ...\n"); + /* Don't translate EMPTY */ + for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++) + translabel(g, &g->g_ll.ll_label[i]); +} diff --git a/Parser/grammar1.c b/Parser/grammar1.c new file mode 100644 index 0000000..0cd66af --- /dev/null +++ b/Parser/grammar1.c @@ -0,0 +1,52 @@ +/* Grammar subroutines needed by parser */ + +#include "PROTO.h" +#define NULL 0 +#include "assert.h" +#include "grammar.h" +#include "token.h" + +/* Return the DFA for the given type */ + +dfa * +finddfa(g, type) + grammar *g; + register int type; +{ + register int i; + register dfa *d; + + for (i = g->g_ndfas, d = g->g_dfa; --i >= 0; d++) { + if (d->d_type == type) + return d; + } + assert(0); + /* NOTREACHED */ +} + +char * +labelrepr(lb) + label *lb; +{ + static char buf[100]; + + if (lb->lb_type == ENDMARKER) + return "EMPTY"; + else if (ISNONTERMINAL(lb->lb_type)) { + if (lb->lb_str == NULL) { + sprintf(buf, "NT%d", lb->lb_type); + return buf; + } + else + return lb->lb_str; + } + else { + if (lb->lb_str == NULL) + return tok_name[lb->lb_type]; + else { + sprintf(buf, "%.32s(%.32s)", + tok_name[lb->lb_type], lb->lb_str); + return buf; + } + } +} diff --git a/Parser/intrcheck.c b/Parser/intrcheck.c new file mode 100644 index 0000000..f963208 --- /dev/null +++ b/Parser/intrcheck.c @@ -0,0 +1,95 @@ +/* Check for interrupts */ + +#ifdef MSDOS + +/* This might work for MS-DOS: */ + +void +initintr() +{ +} + +int +intrcheck() +{ + int interrupted = 0; + while (kbhit()) { + if (getch() == '\003') + interrupted = 1; + } + return interrupted; +} + +#define OK + +#endif + + +#ifdef THINK_C + +#include <MacHeaders> + +void +initintr() +{ +} + +int +intrcheck() +{ + /* Static to make it faster(?) only */ + static EventRecord e; + + /* XXX This fails if the user first types ahead and then + decides to interrupt -- repeating Command-. until the + event queue overflows may work though. */ + if (EventAvail(keyDownMask|autoKeyMask, &e) && + (e.modifiers & cmdKey) && + (e.message & charCodeMask) == '.') { + (void) GetNextEvent(keyDownMask|autoKeyMask, &e); + return 1; + } + return 0; +} + +#define OK + +#endif /* THINK_C */ + + +#ifndef OK + +/* Default version -- should work for Unix and Standard C */ + +#include <stdio.h> +#include <signal.h> + +#include "sigtype.h" + +static int interrupted; + +static SIGTYPE +intcatcher(sig) + int sig; +{ + interrupted = 1; + signal(SIGINT, intcatcher); +} + +void +initintr() +{ + if (signal(SIGINT, SIG_IGN) != SIG_IGN) + signal(SIGINT, intcatcher); +} + +int +intrcheck() +{ + if (!interrupted) + return 0; + interrupted = 0; + return 1; +} + +#endif /* !OK */ diff --git a/Parser/listnode.c b/Parser/listnode.c new file mode 100644 index 0000000..e0a979e --- /dev/null +++ b/Parser/listnode.c @@ -0,0 +1,68 @@ +/* List a node on a file */ + +#include <stdio.h> + +#include "PROTO.h" +#include "token.h" +#include "node.h" + +static int level, atbol; + +static void +list1node(fp, n) + FILE *fp; + node *n; +{ + if (n == 0) + return; + if (ISNONTERMINAL(TYPE(n))) { + int i; + for (i = 0; i < NCH(n); i++) + list1node(fp, CHILD(n, i)); + } + else if (ISTERMINAL(TYPE(n))) { + switch (TYPE(n)) { + case INDENT: + ++level; + break; + case DEDENT: + --level; + break; + default: + if (atbol) { + int i; + for (i = 0; i < level; ++i) + fprintf(fp, "\t"); + atbol = 0; + } + if (TYPE(n) == NEWLINE) { + if (STR(n) != NULL) + fprintf(fp, "%s", STR(n)); + fprintf(fp, "\n"); + atbol = 1; + } + else + fprintf(fp, "%s ", STR(n)); + break; + } + } + else + fprintf(fp, "? "); +} + +void +listnode(fp, n) + FILE *fp; + node *n; +{ + level = 0; + atbol = 1; + list1node(fp, n); +} + +void +listtree(n) + node *n; +{ + listnode(stdout, n); +} diff --git a/Parser/metagrammar.c b/Parser/metagrammar.c new file mode 100644 index 0000000..a7e6364 --- /dev/null +++ b/Parser/metagrammar.c @@ -0,0 +1,151 @@ +#include "PROTO.h" +#include "metagrammar.h" +#include "grammar.h" +static arc arcs_0_0[3] = { + {2, 0}, + {3, 0}, + {4, 1}, +}; +static arc arcs_0_1[1] = { + {0, 1}, +}; +static state states_0[2] = { + {3, arcs_0_0}, + {1, arcs_0_1}, +}; +static arc arcs_1_0[1] = { + {5, 1}, +}; +static arc arcs_1_1[1] = { + {6, 2}, +}; +static arc arcs_1_2[1] = { + {7, 3}, +}; +static arc arcs_1_3[1] = { + {3, 4}, +}; +static arc arcs_1_4[1] = { + {0, 4}, +}; +static state states_1[5] = { + {1, arcs_1_0}, + {1, arcs_1_1}, + {1, arcs_1_2}, + {1, arcs_1_3}, + {1, arcs_1_4}, +}; +static arc arcs_2_0[1] = { + {8, 1}, +}; +static arc arcs_2_1[2] = { + {9, 0}, + {0, 1}, +}; +static state states_2[2] = { + {1, arcs_2_0}, + {2, arcs_2_1}, +}; +static arc arcs_3_0[1] = { + {10, 1}, +}; +static arc arcs_3_1[2] = { + {10, 1}, + {0, 1}, +}; +static state states_3[2] = { + {1, arcs_3_0}, + {2, arcs_3_1}, +}; +static arc arcs_4_0[2] = { + {11, 1}, + {13, 2}, +}; +static arc arcs_4_1[1] = { + {7, 3}, +}; +static arc arcs_4_2[3] = { + {14, 4}, + {15, 4}, + {0, 2}, +}; +static arc arcs_4_3[1] = { + {12, 4}, +}; +static arc arcs_4_4[1] = { + {0, 4}, +}; +static state states_4[5] = { + {2, arcs_4_0}, + {1, arcs_4_1}, + {3, arcs_4_2}, + {1, arcs_4_3}, + {1, arcs_4_4}, +}; +static arc arcs_5_0[3] = { + {5, 1}, + {16, 1}, + {17, 2}, +}; +static arc arcs_5_1[1] = { + {0, 1}, +}; +static arc arcs_5_2[1] = { + {7, 3}, +}; +static arc arcs_5_3[1] = { + {18, 1}, +}; +static state states_5[4] = { + {3, arcs_5_0}, + {1, arcs_5_1}, + {1, arcs_5_2}, + {1, arcs_5_3}, +}; +static dfa dfas[6] = { + {256, "MSTART", 0, 2, states_0, + "\070\000\000"}, + {257, "RULE", 0, 5, states_1, + "\040\000\000"}, + {258, "RHS", 0, 2, states_2, + "\040\010\003"}, + {259, "ALT", 0, 2, states_3, + "\040\010\003"}, + {260, "ITEM", 0, 5, states_4, + "\040\010\003"}, + {261, "ATOM", 0, 4, states_5, + "\040\000\003"}, +}; +static label labels[19] = { + {0, "EMPTY"}, + {256, 0}, + {257, 0}, + {4, 0}, + {0, 0}, + {1, 0}, + {11, 0}, + {258, 0}, + {259, 0}, + {18, 0}, + {260, 0}, + {9, 0}, + {10, 0}, + {261, 0}, + {16, 0}, + {14, 0}, + {3, 0}, + {7, 0}, + {8, 0}, +}; +static grammar gram = { + 6, + dfas, + {19, labels}, + 256 +}; + +grammar * +meta_grammar() +{ + return &gram; +} diff --git a/Parser/node.c b/Parser/node.c new file mode 100644 index 0000000..86d607a --- /dev/null +++ b/Parser/node.c @@ -0,0 +1,47 @@ +/* Parse tree node implementation */ + +#include "PROTO.h" +#include "malloc.h" +#include "node.h" + +node * +newnode(type) + int type; +{ + node *n = NEW(node, 1); + if (n == NULL) + return NULL; + n->n_type = type; + n->n_str = NULL; + n->n_nchildren = 0; + n->n_child = NULL; + return n; +} + +#define XXX 3 /* Node alignment factor to speed up realloc */ +#define XXXROUNDUP(n) ((n) == 1 ? 1 : ((n) + XXX - 1) / XXX * XXX) + +node * +addchild(n1, type, str) + register node *n1; + int type; + char *str; +{ + register int nch = n1->n_nchildren; + register int nch1 = nch+1; + register node *n; + if (XXXROUNDUP(nch) < nch1) { + n = n1->n_child; + nch1 = XXXROUNDUP(nch1); + RESIZE(n, node, nch1); + if (n == NULL) + return NULL; + n1->n_child = n; + } + n = &n1->n_child[n1->n_nchildren++]; + n->n_type = type; + n->n_str = str; + n->n_nchildren = 0; + n->n_child = NULL; + return n; +} diff --git a/Parser/parser.c b/Parser/parser.c new file mode 100644 index 0000000..e0b3530 --- /dev/null +++ b/Parser/parser.c @@ -0,0 +1,396 @@ +/* Parser implementation */ + +/* For a description, see the comments at end of this file */ + +/* XXX To do: error recovery */ + +#include <stdio.h> +#include "assert.h" + +#include "PROTO.h" +#include "malloc.h" +#include "token.h" +#include "grammar.h" +#include "node.h" +#include "parser.h" +#include "errcode.h" + +extern int debugging; + +#ifdef DEBUG +#define D(x) if (!debugging); else x +#else +#define D(x) +#endif + + +/* STACK DATA TYPE */ + +static void s_reset PROTO((stack *)); + +static void +s_reset(s) + stack *s; +{ + s->s_top = &s->s_base[MAXSTACK]; +} + +#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK]) + +static int s_push PROTO((stack *, dfa *, node *)); + +static int +s_push(s, d, parent) + register stack *s; + dfa *d; + node *parent; +{ + register stackentry *top; + if (s->s_top == s->s_base) { + fprintf(stderr, "s_push: parser stack overflow\n"); + return -1; + } + top = --s->s_top; + top->s_dfa = d; + top->s_parent = parent; + top->s_state = 0; + return 0; +} + +#ifdef DEBUG + +static void s_pop PROTO((stack *)); + +static void +s_pop(s) + register stack *s; +{ + if (s_empty(s)) { + fprintf(stderr, "s_pop: parser stack underflow -- FATAL\n"); + abort(); + } + s->s_top++; +} + +#else /* !DEBUG */ + +#define s_pop(s) (s)->s_top++ + +#endif + + +/* PARSER CREATION */ + +parser_state * +newparser(g, start) + grammar *g; + int start; +{ + parser_state *ps; + + if (!g->g_accel) + addaccelerators(g); + ps = NEW(parser_state, 1); + if (ps == NULL) + return NULL; + ps->p_grammar = g; + ps->p_tree = newnode(start); + if (ps->p_tree == NULL) { + if (ps->p_tree != NULL) + DEL(ps->p_tree); /* XXX freeing a node!?! */ + DEL(ps); + return NULL; + } + s_reset(&ps->p_stack); + (void) s_push(&ps->p_stack, finddfa(g, start), ps->p_tree); + return ps; +} + +void +delparser(ps) + parser_state *ps; +{ + DEL(ps); +} + + +/* PARSER STACK OPERATIONS */ + +static int shift PROTO((stack *, int, char *, int)); + +static int +shift(s, type, str, newstate) + register stack *s; + int type; + char *str; + int newstate; +{ + assert(!s_empty(s)); + if (addchild(s->s_top->s_parent, type, str) == NULL) { + fprintf(stderr, "shift: no mem in addchild\n"); + return -1; + } + s->s_top->s_state = newstate; + return 0; +} + +static int push PROTO((stack *, int, dfa *, int)); + +static int +push(s, type, d, newstate) + register stack *s; + int type; + dfa *d; + int newstate; +{ + register node *n; + n = s->s_top->s_parent; + assert(!s_empty(s)); + if (addchild(n, type, (char *)NULL) == NULL) { + fprintf(stderr, "push: no mem in addchild\n"); + return -1; + } + s->s_top->s_state = newstate; + return s_push(s, d, CHILD(n, NCH(n)-1)); +} + + +/* PARSER PROPER */ + +static int classify PROTO((grammar *, int, char *)); + +static int +classify(g, type, str) + grammar *g; + register int type; + char *str; +{ + register int n = g->g_ll.ll_nlabels; + + if (type == NAME) { + register char *s = str; + register label *l = g->g_ll.ll_label; + register int i; + for (i = n; i > 0; i--, l++) { + if (l->lb_type == NAME && l->lb_str != NULL && + l->lb_str[0] == s[0] && + strcmp(l->lb_str, s) == 0) { + D(printf("It's a keyword\n")); + return n - i; + } + } + } + + { + register label *l = g->g_ll.ll_label; + register int i; + for (i = n; i > 0; i--, l++) { + if (l->lb_type == type && l->lb_str == NULL) { + D(printf("It's a token we know\n")); + return n - i; + } + } + } + + D(printf("Illegal token\n")); + return -1; +} + +int +addtoken(ps, type, str) + register parser_state *ps; + register int type; + char *str; +{ + register int ilabel; + + D(printf("Token %s/'%s' ... ", tok_name[type], str)); + + /* Find out which label this token is */ + ilabel = classify(ps->p_grammar, type, str); + if (ilabel < 0) + return E_SYNTAX; + + /* Loop until the token is shifted or an error occurred */ + for (;;) { + /* Fetch the current dfa and state */ + register dfa *d = ps->p_stack.s_top->s_dfa; + register state *s = &d->d_state[ps->p_stack.s_top->s_state]; + + D(printf(" DFA '%s', state %d:", + d->d_name, ps->p_stack.s_top->s_state)); + + /* Check accelerator */ + if (s->s_lower <= ilabel && ilabel < s->s_upper) { + register int x = s->s_accel[ilabel - s->s_lower]; + if (x != -1) { + if (x & (1<<7)) { + /* Push non-terminal */ + int nt = (x >> 8) + NT_OFFSET; + int arrow = x & ((1<<7)-1); + dfa *d1 = finddfa(ps->p_grammar, nt); + if (push(&ps->p_stack, nt, d1, arrow) < 0) { + D(printf(" MemError: push.\n")); + return E_NOMEM; + } + D(printf(" Push ...\n")); + continue; + } + + /* Shift the token */ + if (shift(&ps->p_stack, type, str, x) < 0) { + D(printf(" MemError: shift.\n")); + return E_NOMEM; + } + D(printf(" Shift.\n")); + /* Pop while we are in an accept-only state */ + while (s = &d->d_state + [ps->p_stack.s_top->s_state], + s->s_accept && s->s_narcs == 1) { + D(printf(" Direct pop.\n")); + s_pop(&ps->p_stack); + if (s_empty(&ps->p_stack)) { + D(printf(" ACCEPT.\n")); + return E_DONE; + } + d = ps->p_stack.s_top->s_dfa; + } + return E_OK; + } + } + + if (s->s_accept) { + /* Pop this dfa and try again */ + s_pop(&ps->p_stack); + D(printf(" Pop ...\n")); + if (s_empty(&ps->p_stack)) { + D(printf(" Error: bottom of stack.\n")); + return E_SYNTAX; + } + continue; + } + + /* Stuck, report syntax error */ + D(printf(" Error.\n")); + return E_SYNTAX; + } +} + + +#ifdef DEBUG + +/* DEBUG OUTPUT */ + +void +dumptree(g, n) + grammar *g; + node *n; +{ + int i; + + if (n == NULL) + printf("NIL"); + else { + label l; + l.lb_type = TYPE(n); + l.lb_str = TYPE(str); + printf("%s", labelrepr(&l)); + if (ISNONTERMINAL(TYPE(n))) { + printf("("); + for (i = 0; i < NCH(n); i++) { + if (i > 0) + printf(","); + dumptree(g, CHILD(n, i)); + } + printf(")"); + } + } +} + +void +showtree(g, n) + grammar *g; + node *n; +{ + int i; + + if (n == NULL) + return; + if (ISNONTERMINAL(TYPE(n))) { + for (i = 0; i < NCH(n); i++) + showtree(g, CHILD(n, i)); + } + else if (ISTERMINAL(TYPE(n))) { + printf("%s", tok_name[TYPE(n)]); + if (TYPE(n) == NUMBER || TYPE(n) == NAME) + printf("(%s)", STR(n)); + printf(" "); + } + else + printf("? "); +} + +void +printtree(ps) + parser_state *ps; +{ + if (debugging) { + printf("Parse tree:\n"); + dumptree(ps->p_grammar, ps->p_tree); + printf("\n"); + printf("Tokens:\n"); + showtree(ps->p_grammar, ps->p_tree); + printf("\n"); + } + printf("Listing:\n"); + listtree(ps->p_tree); + printf("\n"); +} + +#endif /* DEBUG */ + +/* + +Description +----------- + +The parser's interface is different than usual: the function addtoken() +must be called for each token in the input. This makes it possible to +turn it into an incremental parsing system later. The parsing system +constructs a parse tree as it goes. + +A parsing rule is represented as a Deterministic Finite-state Automaton +(DFA). A node in a DFA represents a state of the parser; an arc represents +a transition. Transitions are either labeled with terminal symbols or +with non-terminals. When the parser decides to follow an arc labeled +with a non-terminal, it is invoked recursively with the DFA representing +the parsing rule for that as its initial state; when that DFA accepts, +the parser that invoked it continues. The parse tree constructed by the +recursively called parser is inserted as a child in the current parse tree. + +The DFA's can be constructed automatically from a more conventional +language description. An extended LL(1) grammar (ELL(1)) is suitable. +Certain restrictions make the parser's life easier: rules that can produce +the empty string should be outlawed (there are other ways to put loops +or optional parts in the language). To avoid the need to construct +FIRST sets, we can require that all but the last alternative of a rule +(really: arc going out of a DFA's state) must begin with a terminal +symbol. + +As an example, consider this grammar: + +expr: term (OP term)* +term: CONSTANT | '(' expr ')' + +The DFA corresponding to the rule for expr is: + +------->.---term-->.-------> + ^ | + | | + \----OP----/ + +The parse tree generated for the input a+b is: + +(expr: (term: (NAME: a)), (OP: +), (term: (NAME: b))) + +*/ diff --git a/Parser/parser.h b/Parser/parser.h new file mode 100644 index 0000000..16eee0e --- /dev/null +++ b/Parser/parser.h @@ -0,0 +1,25 @@ +/* Parser interface */ + +#define MAXSTACK 100 + +typedef struct _stackentry { + int s_state; /* State in current DFA */ + dfa *s_dfa; /* Current DFA */ + node *s_parent; /* Where to add next node */ +} stackentry; + +typedef struct _stack { + stackentry *s_top; /* Top entry */ + stackentry s_base[MAXSTACK];/* Array of stack entries */ + /* NB The stack grows down */ +} stack; + +typedef struct { + struct _stack p_stack; /* Stack of parser states */ + struct _grammar *p_grammar; /* Grammar to use */ + struct _node *p_tree; /* Top of parse tree */ +} parser_state; + +parser_state *newparser PROTO((struct _grammar *g, int start)); +void delparser PROTO((parser_state *ps)); +int addtoken PROTO((parser_state *ps, int type, char *str)); diff --git a/Parser/parsetok.c b/Parser/parsetok.c new file mode 100644 index 0000000..01877a1 --- /dev/null +++ b/Parser/parsetok.c @@ -0,0 +1,131 @@ +/* Parser-tokenizer link implementation */ + +#include <stdio.h> + +#include "PROTO.h" +#include "malloc.h" +#include "tokenizer.h" +#include "node.h" +#include "grammar.h" +#include "parser.h" +#include "errcode.h" + +extern int debugging; + + +/* Parse input coming from the given tokenizer structure. + Return error code. */ + +static int +parsetok(tok, g, start, n_ret) + struct tok_state *tok; + grammar *g; + int start; + node **n_ret; +{ + parser_state *ps; + int ret; + + if ((ps = newparser(g, start)) == NULL) { + fprintf(stderr, "no mem for new parser\n"); + return E_NOMEM; + } + + for (;;) { + char *a, *b; + int type; + int len; + char *str; + + type = tok_get(tok, &a, &b); + if (type == ERRORTOKEN) { + ret = tok->done; + break; + } + len = b - a; + str = NEW(char, len + 1); + if (str == NULL) { + fprintf(stderr, "no mem for next token\n"); + ret = E_NOMEM; + break; + } + strncpy(str, a, len); + str[len] = '\0'; + ret = addtoken(ps, (int)type, str); + if (ret != E_OK) { + if (ret == E_DONE) + *n_ret = ps->p_tree; + else if (tok->lineno <= 1 && tok->done == E_EOF) + ret = E_EOF; + break; + } + } + + delparser(ps); + return ret; +} + + +/* Parse input coming from a string. Return error code. */ + +int +parsestring(s, g, start, n_ret) + char *s; + grammar *g; + int start; + node **n_ret; +{ + struct tok_state *tok = tok_setups(s); + int ret; + + if (tok == NULL) { + fprintf(stderr, "no mem for tok_setups\n"); + return E_NOMEM; + } + ret = parsetok(tok, g, start, n_ret); + if (ret == E_TOKEN || ret == E_SYNTAX) { + fprintf(stderr, "String parsing error at line %d\n", + tok->lineno); + } + tok_free(tok); + return ret; +} + + +/* Parse input coming from a file. Return error code. */ + +int +parsefile(fp, g, start, ps1, ps2, n_ret) + FILE *fp; + grammar *g; + int start; + char *ps1, *ps2; + node **n_ret; +{ + struct tok_state *tok = tok_setupf(fp, ps1, ps2); + int ret; + + if (tok == NULL) { + fprintf(stderr, "no mem for tok_setupf\n"); + return E_NOMEM; + } + ret = parsetok(tok, g, start, n_ret); + if (ret == E_TOKEN || ret == E_SYNTAX) { + char *p; + fprintf(stderr, "Parsing error at line %d:\n", + tok->lineno); + *tok->inp = '\0'; + if (tok->inp > tok->buf && tok->inp[-1] == '\n') + tok->inp[-1] = '\0'; + fprintf(stderr, "%s\n", tok->buf); + for (p = tok->buf; p < tok->cur; p++) { + if (*p == '\t') + putc('\t', stderr); + else + putc(' ', stderr); + } + fprintf(stderr, "^\n"); + } + tok_free(tok); + return ret; +} diff --git a/Parser/pgen.c b/Parser/pgen.c new file mode 100644 index 0000000..34d9b71 --- /dev/null +++ b/Parser/pgen.c @@ -0,0 +1,729 @@ +/* Parser generator */ + +/* For a description, see the comments at end of this file */ + +#include <stdio.h> +#include "assert.h" + +#include "PROTO.h" +#include "malloc.h" +#include "token.h" +#include "node.h" +#include "grammar.h" +#include "metagrammar.h" +#include "pgen.h" + +extern int debugging; + + +/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ + +typedef struct _nfaarc { + int ar_label; + int ar_arrow; +} nfaarc; + +typedef struct _nfastate { + int st_narcs; + nfaarc *st_arc; +} nfastate; + +typedef struct _nfa { + int nf_type; + char *nf_name; + int nf_nstates; + nfastate *nf_state; + int nf_start, nf_finish; +} nfa; + +static int +addnfastate(nf) + nfa *nf; +{ + nfastate *st; + + RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); + if (nf->nf_state == NULL) + fatal("out of mem"); + st = &nf->nf_state[nf->nf_nstates++]; + st->st_narcs = 0; + st->st_arc = NULL; + return st - nf->nf_state; +} + +static void +addnfaarc(nf, from, to, lbl) + nfa *nf; + int from, to, lbl; +{ + nfastate *st; + nfaarc *ar; + + st = &nf->nf_state[from]; + RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); + if (st->st_arc == NULL) + fatal("out of mem"); + ar = &st->st_arc[st->st_narcs++]; + ar->ar_label = lbl; + ar->ar_arrow = to; +} + +static nfa * +newnfa(name) + char *name; +{ + nfa *nf; + static type = NT_OFFSET; /* All types will be disjunct */ + + nf = NEW(nfa, 1); + if (nf == NULL) + fatal("no mem for new nfa"); + nf->nf_type = type++; + nf->nf_name = name; /* XXX strdup(name) ??? */ + nf->nf_nstates = 0; + nf->nf_state = NULL; + nf->nf_start = nf->nf_finish = -1; + return nf; +} + +typedef struct _nfagrammar { + int gr_nnfas; + nfa **gr_nfa; + labellist gr_ll; +} nfagrammar; + +static nfagrammar * +newnfagrammar() +{ + nfagrammar *gr; + + gr = NEW(nfagrammar, 1); + if (gr == NULL) + fatal("no mem for new nfa grammar"); + gr->gr_nnfas = 0; + gr->gr_nfa = NULL; + gr->gr_ll.ll_nlabels = 0; + gr->gr_ll.ll_label = NULL; + addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); + return gr; +} + +static nfa * +addnfa(gr, name) + nfagrammar *gr; + char *name; +{ + nfa *nf; + + nf = newnfa(name); + RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); + if (gr->gr_nfa == NULL) + fatal("out of mem"); + gr->gr_nfa[gr->gr_nnfas++] = nf; + addlabel(&gr->gr_ll, NAME, nf->nf_name); + return nf; +} + +#ifdef DEBUG + +static char REQNFMT[] = "metacompile: less than %d children\n"; + +#define REQN(i, count) \ + if (i < count) { \ + fprintf(stderr, REQNFMT, count); \ + abort(); \ + } else + +#else +#define REQN(i, count) /* empty */ +#endif + +static nfagrammar * +metacompile(n) + node *n; +{ + nfagrammar *gr; + int i; + + printf("Compiling (meta-) parse tree into NFA grammar\n"); + gr = newnfagrammar(); + REQ(n, MSTART); + i = n->n_nchildren - 1; /* Last child is ENDMARKER */ + n = n->n_child; + for (; --i >= 0; n++) { + if (n->n_type != NEWLINE) + compile_rule(gr, n); + } + return gr; +} + +static +compile_rule(gr, n) + nfagrammar *gr; + node *n; +{ + nfa *nf; + + REQ(n, RULE); + REQN(n->n_nchildren, 4); + n = n->n_child; + REQ(n, NAME); + nf = addnfa(gr, n->n_str); + n++; + REQ(n, COLON); + n++; + REQ(n, RHS); + compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); + n++; + REQ(n, NEWLINE); +} + +static +compile_rhs(ll, nf, n, pa, pb) + labellist *ll; + nfa *nf; + node *n; + int *pa, *pb; +{ + int i; + int a, b; + + REQ(n, RHS); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + REQ(n, ALT); + compile_alt(ll, nf, n, pa, pb); + if (--i <= 0) + return; + n++; + a = *pa; + b = *pb; + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + for (; --i >= 0; n++) { + REQ(n, VBAR); + REQN(i, 1); + --i; + n++; + REQ(n, ALT); + compile_alt(ll, nf, n, &a, &b); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + } +} + +static +compile_alt(ll, nf, n, pa, pb) + labellist *ll; + nfa *nf; + node *n; + int *pa, *pb; +{ + int i; + int a, b; + + REQ(n, ALT); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + REQ(n, ITEM); + compile_item(ll, nf, n, pa, pb); + --i; + n++; + for (; --i >= 0; n++) { + if (n->n_type == COMMA) { /* XXX Temporary */ + REQN(i, 1); + --i; + n++; + } + REQ(n, ITEM); + compile_item(ll, nf, n, &a, &b); + addnfaarc(nf, *pb, a, EMPTY); + *pb = b; + } +} + +static +compile_item(ll, nf, n, pa, pb) + labellist *ll; + nfa *nf; + node *n; + int *pa, *pb; +{ + int i; + int a, b; + + REQ(n, ITEM); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + if (n->n_type == LSQB) { + REQN(i, 3); + n++; + REQ(n, RHS); + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, *pb, EMPTY); + compile_rhs(ll, nf, n, &a, &b); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + REQN(i, 1); + n++; + REQ(n, RSQB); + } + else { + compile_atom(ll, nf, n, pa, pb); + if (--i <= 0) + return; + n++; + addnfaarc(nf, *pb, *pa, EMPTY); + if (n->n_type == STAR) + *pb = *pa; + else + REQ(n, PLUS); + } +} + +static +compile_atom(ll, nf, n, pa, pb) + labellist *ll; + nfa *nf; + node *n; + int *pa, *pb; +{ + int i; + + REQ(n, ATOM); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + if (n->n_type == LPAR) { + REQN(i, 3); + n++; + REQ(n, RHS); + compile_rhs(ll, nf, n, pa, pb); + n++; + REQ(n, RPAR); + } + else if (n->n_type == NAME || n->n_type == STRING) { + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); + } + else + REQ(n, NAME); +} + +static void +dumpstate(ll, nf, istate) + labellist *ll; + nfa *nf; + int istate; +{ + nfastate *st; + int i; + nfaarc *ar; + + printf("%c%2d%c", + istate == nf->nf_start ? '*' : ' ', + istate, + istate == nf->nf_finish ? '.' : ' '); + st = &nf->nf_state[istate]; + ar = st->st_arc; + for (i = 0; i < st->st_narcs; i++) { + if (i > 0) + printf("\n "); + printf("-> %2d %s", ar->ar_arrow, + labelrepr(&ll->ll_label[ar->ar_label])); + ar++; + } + printf("\n"); +} + +static void +dumpnfa(ll, nf) + labellist *ll; + nfa *nf; +{ + int i; + + printf("NFA '%s' has %d states; start %d, finish %d\n", + nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); + for (i = 0; i < nf->nf_nstates; i++) + dumpstate(ll, nf, i); +} + + +/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ + +static int +addclosure(ss, nf, istate) + bitset ss; + nfa *nf; + int istate; +{ + if (addbit(ss, istate)) { + nfastate *st = &nf->nf_state[istate]; + nfaarc *ar = st->st_arc; + int i; + + for (i = st->st_narcs; --i >= 0; ) { + if (ar->ar_label == EMPTY) + addclosure(ss, nf, ar->ar_arrow); + ar++; + } + } +} + +typedef struct _ss_arc { + bitset sa_bitset; + int sa_arrow; + int sa_label; +} ss_arc; + +typedef struct _ss_state { + bitset ss_ss; + int ss_narcs; + ss_arc *ss_arc; + int ss_deleted; + int ss_finish; + int ss_rename; +} ss_state; + +typedef struct _ss_dfa { + int sd_nstates; + ss_state *sd_state; +} ss_dfa; + +static +makedfa(gr, nf, d) + nfagrammar *gr; + nfa *nf; + dfa *d; +{ + int nbits = nf->nf_nstates; + bitset ss; + int xx_nstates; + ss_state *xx_state, *yy; + ss_arc *zz; + int istate, jstate, iarc, jarc, ibit; + nfastate *st; + nfaarc *ar; + + ss = newbitset(nbits); + addclosure(ss, nf, nf->nf_start); + xx_state = NEW(ss_state, 1); + if (xx_state == NULL) + fatal("no mem for xx_state in makedfa"); + xx_nstates = 1; + yy = &xx_state[0]; + yy->ss_ss = ss; + yy->ss_narcs = 0; + yy->ss_arc = NULL; + yy->ss_deleted = 0; + yy->ss_finish = testbit(ss, nf->nf_finish); + if (yy->ss_finish) + printf("Error: nonterminal '%s' may produce empty.\n", + nf->nf_name); + + /* This algorithm is from a book written before + the invention of structured programming... */ + + /* For each unmarked state... */ + for (istate = 0; istate < xx_nstates; ++istate) { + yy = &xx_state[istate]; + ss = yy->ss_ss; + /* For all its states... */ + for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { + if (!testbit(ss, ibit)) + continue; + st = &nf->nf_state[ibit]; + /* For all non-empty arcs from this state... */ + for (iarc = 0; iarc < st->st_narcs; iarc++) { + ar = &st->st_arc[iarc]; + if (ar->ar_label == EMPTY) + continue; + /* Look up in list of arcs from this state */ + for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { + zz = &yy->ss_arc[jarc]; + if (ar->ar_label == zz->sa_label) + goto found; + } + /* Add new arc for this state */ + RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1); + if (yy->ss_arc == NULL) + fatal("out of mem"); + zz = &yy->ss_arc[yy->ss_narcs++]; + zz->sa_label = ar->ar_label; + zz->sa_bitset = newbitset(nbits); + zz->sa_arrow = -1; + found: ; + /* Add destination */ + addclosure(zz->sa_bitset, nf, ar->ar_arrow); + } + } + /* Now look up all the arrow states */ + for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { + zz = &xx_state[istate].ss_arc[jarc]; + for (jstate = 0; jstate < xx_nstates; jstate++) { + if (samebitset(zz->sa_bitset, + xx_state[jstate].ss_ss, nbits)) { + zz->sa_arrow = jstate; + goto done; + } + } + RESIZE(xx_state, ss_state, xx_nstates + 1); + if (xx_state == NULL) + fatal("out of mem"); + zz->sa_arrow = xx_nstates; + yy = &xx_state[xx_nstates++]; + yy->ss_ss = zz->sa_bitset; + yy->ss_narcs = 0; + yy->ss_arc = NULL; + yy->ss_deleted = 0; + yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); + done: ; + } + } + + if (debugging) + printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, + "before minimizing"); + + simplify(xx_nstates, xx_state); + + if (debugging) + printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, + "after minimizing"); + + convert(d, xx_nstates, xx_state); + + /* XXX cleanup */ +} + +static +printssdfa(xx_nstates, xx_state, nbits, ll, msg) + int xx_nstates; + ss_state *xx_state; + int nbits; + labellist *ll; + char *msg; +{ + int i, ibit, iarc; + ss_state *yy; + ss_arc *zz; + + printf("Subset DFA %s\n", msg); + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + printf(" Subset %d", i); + if (yy->ss_finish) + printf(" (finish)"); + printf(" { "); + for (ibit = 0; ibit < nbits; ibit++) { + if (testbit(yy->ss_ss, ibit)) + printf("%d ", ibit); + } + printf("}\n"); + for (iarc = 0; iarc < yy->ss_narcs; iarc++) { + zz = &yy->ss_arc[iarc]; + printf(" Arc to state %d, label %s\n", + zz->sa_arrow, + labelrepr(&ll->ll_label[zz->sa_label])); + } + } +} + + +/* PART THREE -- SIMPLIFY DFA */ + +/* Simplify the DFA by repeatedly eliminating states that are + equivalent to another oner. This is NOT Algorithm 3.3 from + [Aho&Ullman 77]. It does not always finds the minimal DFA, + but it does usually make a much smaller one... (For an example + of sub-optimal behaviour, try S: x a b+ | y a b+.) +*/ + +static int +samestate(s1, s2) + ss_state *s1, *s2; +{ + int i; + + if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) + return 0; + for (i = 0; i < s1->ss_narcs; i++) { + if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || + s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) + return 0; + } + return 1; +} + +static void +renamestates(xx_nstates, xx_state, from, to) + int xx_nstates; + ss_state *xx_state; + int from, to; +{ + int i, j; + + if (debugging) + printf("Rename state %d to %d.\n", from, to); + for (i = 0; i < xx_nstates; i++) { + if (xx_state[i].ss_deleted) + continue; + for (j = 0; j < xx_state[i].ss_narcs; j++) { + if (xx_state[i].ss_arc[j].sa_arrow == from) + xx_state[i].ss_arc[j].sa_arrow = to; + } + } +} + +static +simplify(xx_nstates, xx_state) + int xx_nstates; + ss_state *xx_state; +{ + int changes; + int i, j, k; + + do { + changes = 0; + for (i = 1; i < xx_nstates; i++) { + if (xx_state[i].ss_deleted) + continue; + for (j = 0; j < i; j++) { + if (xx_state[j].ss_deleted) + continue; + if (samestate(&xx_state[i], &xx_state[j])) { + xx_state[i].ss_deleted++; + renamestates(xx_nstates, xx_state, i, j); + changes++; + break; + } + } + } + } while (changes); +} + + +/* PART FOUR -- GENERATE PARSING TABLES */ + +/* Convert the DFA into a grammar that can be used by our parser */ + +static +convert(d, xx_nstates, xx_state) + dfa *d; + int xx_nstates; + ss_state *xx_state; +{ + int i, j; + ss_state *yy; + ss_arc *zz; + + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + yy->ss_rename = addstate(d); + } + + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + for (j = 0; j < yy->ss_narcs; j++) { + zz = &yy->ss_arc[j]; + addarc(d, yy->ss_rename, + xx_state[zz->sa_arrow].ss_rename, + zz->sa_label); + } + if (yy->ss_finish) + addarc(d, yy->ss_rename, yy->ss_rename, 0); + } + + d->d_initial = 0; +} + + +/* PART FIVE -- GLUE IT ALL TOGETHER */ + +static grammar * +maketables(gr) + nfagrammar *gr; +{ + int i; + nfa *nf; + dfa *d; + grammar *g; + + if (gr->gr_nnfas == 0) + return NULL; + g = newgrammar(gr->gr_nfa[0]->nf_type); + /* XXX first rule must be start rule */ + g->g_ll = gr->gr_ll; + + for (i = 0; i < gr->gr_nnfas; i++) { + nf = gr->gr_nfa[i]; + if (debugging) { + printf("Dump of NFA for '%s' ...\n", nf->nf_name); + dumpnfa(&gr->gr_ll, nf); + } + printf("Making DFA for '%s' ...\n", nf->nf_name); + d = adddfa(g, nf->nf_type, nf->nf_name); + makedfa(gr, gr->gr_nfa[i], d); + } + + return g; +} + +grammar * +pgen(n) + node *n; +{ + nfagrammar *gr; + grammar *g; + + gr = metacompile(n); + g = maketables(gr); + translatelabels(g); + addfirstsets(g); + return g; +} + + +/* + +Description +----------- + +Input is a grammar in extended BNF (using * for repetition, + for +at-least-once repetition, [] for optional parts, | for alternatives and +() for grouping). This has already been parsed and turned into a parse +tree. + +Each rule is considered as a regular expression in its own right. +It is turned into a Non-deterministic Finite Automaton (NFA), which +is then turned into a Deterministic Finite Automaton (DFA), which is then +optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, +or similar compiler books (this technique is more often used for lexical +analyzers). + +The DFA's are used by the parser as parsing tables in a special way +that's probably unique. Before they are usable, the FIRST sets of all +non-terminals are computed. + +Reference +--------- + +[Aho&Ullman 77] + Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 + (first edition) + +*/ diff --git a/Parser/pgen.h b/Parser/pgen.h new file mode 100644 index 0000000..7fcf277 --- /dev/null +++ b/Parser/pgen.h @@ -0,0 +1,6 @@ +/* Parser generator interface */ + +extern grammar gram; + +extern grammar *meta_grammar PROTO((void)); +extern grammar *pgen PROTO((node *)); diff --git a/Parser/pgenmain.c b/Parser/pgenmain.c new file mode 100644 index 0000000..678be5d --- /dev/null +++ b/Parser/pgenmain.c @@ -0,0 +1,111 @@ +/* Parser generator main program */ + +#include <stdio.h> + +#include "PROTO.h" +#include "grammar.h" +#include "node.h" +#include "parsetok.h" +#include "pgen.h" + +int debugging; + +#ifdef THINK_C +char * +askfile() +{ + char buf[256]; + static char name[256]; + printf("Input file name: "); + if (fgets(buf, sizeof buf, stdin) == NULL) { + printf("EOF\n"); + exit(1); + } + if (sscanf(buf, " %s ", name) != 1) { + printf("No file\n"); + exit(1); + } + return name; +} +#endif + +grammar * +getgrammar(filename) + char *filename; +{ + FILE *fp; + node *n; + grammar *g0, *g; + + fp = fopen(filename, "r"); + if (fp == NULL) { + perror(filename); + exit(1); + } + g0 = meta_grammar(); + n = NULL; + parsefile(fp, g0, g0->g_start, (char *)NULL, (char *)NULL, &n); + fclose(fp); + if (n == NULL) { + fprintf(stderr, "Parsing error.\n"); + exit(1); + } + g = pgen(n); + if (g == NULL) { + printf("Bad grammar.\n"); + exit(1); + } + return g; +} + +main(argc, argv) + int argc; + char **argv; +{ + grammar *g; + node *n; + FILE *fp; + char *filename; + +#ifdef THINK_C + filename = askfile(); +#else + if (argc != 2) { + fprintf(stderr, "usage: %s grammar\n", argv[0]); + exit(2); + } + filename = argv[1]; +#endif + g = getgrammar(filename); + fp = fopen("graminit.c", "w"); + if (fp == NULL) { + perror("graminit.c"); + exit(1); + } + printf("Writing graminit.c ...\n"); + printgrammar(g, fp); + fclose(fp); + fp = fopen("graminit.h", "w"); + if (fp == NULL) { + perror("graminit.h"); + exit(1); + } + printf("Writing graminit.h ...\n"); + printnonterminals(g, fp); + fclose(fp); + exit(0); +} + +void +fatal(msg) + char *msg; +{ + fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg); + exit(1); +} + +/* TO DO: + + - improve user interface + - check for duplicate definitions of names (instead of fatal err) +*/ diff --git a/Parser/printgrammar.c b/Parser/printgrammar.c new file mode 100644 index 0000000..f6aa2cc --- /dev/null +++ b/Parser/printgrammar.c @@ -0,0 +1,121 @@ +/* Print a bunch of C initializers that represent a grammar */ + +#include <stdio.h> + +#include "PROTO.h" +#include "grammar.h" + +static void +printarcs(i, d, fp) + int i; + dfa *d; + FILE *fp; +{ + arc *a; + state *s; + int j, k; + + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) { + fprintf(fp, "static arc arcs_%d_%d[%d] = {\n", + i, j, s->s_narcs); + a = s->s_arc; + for (k = 0; k < s->s_narcs; k++, a++) + fprintf(fp, "\t{%d, %d},\n", a->a_lbl, a->a_arrow); + fprintf(fp, "};\n"); + } +} + +static void +printstates(g, fp) + grammar *g; + FILE *fp; +{ + state *s; + dfa *d; + int i, j; + + d = g->g_dfa; + for (i = 0; i < g->g_ndfas; i++, d++) { + printarcs(i, d, fp); + fprintf(fp, "static state states_%d[%d] = {\n", + i, d->d_nstates); + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) + fprintf(fp, "\t{%d, arcs_%d_%d},\n", + s->s_narcs, i, j); + fprintf(fp, "};\n"); + } +} + +static void +printdfas(g, fp) + grammar *g; + FILE *fp; +{ + dfa *d; + int i, j; + + printstates(g, fp); + fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas); + d = g->g_dfa; + for (i = 0; i < g->g_ndfas; i++, d++) { + fprintf(fp, "\t{%d, \"%s\", %d, %d, states_%d,\n", + d->d_type, d->d_name, d->d_initial, d->d_nstates, i); + fprintf(fp, "\t \""); + for (j = 0; j < NBYTES(g->g_ll.ll_nlabels); j++) + fprintf(fp, "\\%03o", d->d_first[j] & 0xff); + fprintf(fp, "\"},\n"); + } + fprintf(fp, "};\n"); +} + +static void +printlabels(g, fp) + grammar *g; + FILE *fp; +{ + label *l; + int i; + + fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels); + l = g->g_ll.ll_label; + for (i = g->g_ll.ll_nlabels; --i >= 0; l++) { + if (l->lb_str == NULL) + fprintf(fp, "\t{%d, 0},\n", l->lb_type); + else + fprintf(fp, "\t{%d, \"%s\"},\n", + l->lb_type, l->lb_str); + } + fprintf(fp, "};\n"); +} + +void +printgrammar(g, fp) + grammar *g; + FILE *fp; +{ + fprintf(fp, "#include \"PROTO.h\"\n"); + fprintf(fp, "#include \"grammar.h\"\n"); + printdfas(g, fp); + printlabels(g, fp); + fprintf(fp, "grammar gram = {\n"); + fprintf(fp, "\t%d,\n", g->g_ndfas); + fprintf(fp, "\tdfas,\n"); + fprintf(fp, "\t{%d, labels},\n", g->g_ll.ll_nlabels); + fprintf(fp, "\t%d\n", g->g_start); + fprintf(fp, "};\n"); +} + +void +printnonterminals(g, fp) + grammar *g; + FILE *fp; +{ + dfa *d; + int i; + + d = g->g_dfa; + for (i = g->g_ndfas; --i >= 0; d++) + fprintf(fp, "#define %s %d\n", d->d_name, d->d_type); +} diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c new file mode 100644 index 0000000..38f76ed --- /dev/null +++ b/Parser/tokenizer.c @@ -0,0 +1,490 @@ +/* Tokenizer implementation */ + +/* XXX This is rather old, should be restructured perhaps */ +/* XXX Need a better interface to report errors than writing to stderr */ + +#include <stdio.h> +#include <ctype.h> +#include "string.h" + +#include "PROTO.h" +#include "malloc.h" +#include "tokenizer.h" +#include "errcode.h" + +#ifdef THINK_C +#define TABSIZE 4 +#endif + +#ifndef TABSIZE +#define TABSIZE 8 +#endif + +/* Token names */ + +char *tok_name[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "BACKQUOTE", + "LBRACE", + "RBRACE", + "OP", + "<ERRORTOKEN>", + "<N_TOKENS>" +}; + + +/* Create and initialize a new tok_state structure */ + +static struct tok_state * +tok_new() +{ + struct tok_state *tok = NEW(struct tok_state, 1); + if (tok == NULL) + return NULL; + tok->buf = tok->cur = tok->end = tok->inp = NULL; + tok->done = E_OK; + tok->fp = NULL; + tok->tabsize = TABSIZE; + tok->indent = 0; + tok->indstack[0] = 0; + tok->atbol = 1; + tok->pendin = 0; + tok->prompt = tok->nextprompt = NULL; + tok->lineno = 0; + return tok; +} + + +/* Set up tokenizer for string */ + +struct tok_state * +tok_setups(str) + char *str; +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + tok->buf = tok->cur = str; + tok->end = tok->inp = strchr(str, '\0'); + return tok; +} + + +/* Set up tokenizer for string */ + +struct tok_state * +tok_setupf(fp, ps1, ps2) + FILE *fp; + char *ps1, *ps2; +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + if ((tok->buf = NEW(char, BUFSIZ)) == NULL) { + DEL(tok); + return NULL; + } + tok->cur = tok->inp = tok->buf; + tok->end = tok->buf + BUFSIZ; + tok->fp = fp; + tok->prompt = ps1; + tok->nextprompt = ps2; + return tok; +} + + +/* Free a tok_state structure */ + +void +tok_free(tok) + struct tok_state *tok; +{ + /* XXX really need a separate flag to say 'my buffer' */ + if (tok->fp != NULL && tok->buf != NULL) + DEL(tok->buf); + DEL(tok); +} + + +/* Get next char, updating state; error code goes into tok->done */ + +static int +tok_nextc(tok) + register struct tok_state *tok; +{ + if (tok->done != E_OK) + return EOF; + + for (;;) { + if (tok->cur < tok->inp) + return *tok->cur++; + if (tok->fp == NULL) { + tok->done = E_EOF; + return EOF; + } + if (tok->inp > tok->buf && tok->inp[-1] == '\n') + tok->inp = tok->buf; + if (tok->inp == tok->end) { + int n = tok->end - tok->buf; + char *new = tok->buf; + RESIZE(new, char, n+n); + if (new == NULL) { + fprintf(stderr, "tokenizer out of mem\n"); + tok->done = E_NOMEM; + return EOF; + } + tok->buf = new; + tok->inp = tok->buf + n; + tok->end = tok->inp + n; + } +#ifdef USE_READLINE + if (tok->prompt != NULL) { + extern char *readline PROTO((char *prompt)); + static int been_here; + if (!been_here) { + /* Force rebind of TAB to insert-tab */ + extern int rl_insert(); + rl_bind_key('\t', rl_insert); + been_here++; + } + if (tok->buf != NULL) + free(tok->buf); + tok->buf = readline(tok->prompt); + (void) intrcheck(); /* Clear pending interrupt */ + if (tok->nextprompt != NULL) + tok->prompt = tok->nextprompt; + /* XXX different semantics w/o readline()! */ + if (tok->buf == NULL) { + tok->done = E_EOF; + } + else { + unsigned int n = strlen(tok->buf); + if (n > 0) + add_history(tok->buf); + /* Append the '\n' that readline() + doesn't give us, for the tokenizer... */ + tok->buf = realloc(tok->buf, n+2); + if (tok->buf == NULL) + tok->done = E_NOMEM; + else { + tok->end = tok->buf + n; + *tok->end++ = '\n'; + *tok->end = '\0'; + tok->inp = tok->end; + tok->cur = tok->buf; + } + } + } + else +#endif + { + tok->cur = tok->inp; + if (tok->prompt != NULL && tok->inp == tok->buf) { + fprintf(stderr, "%s", tok->prompt); + tok->prompt = tok->nextprompt; + } + tok->done = fgets_intr(tok->inp, + (int)(tok->end - tok->inp), tok->fp); + } + if (tok->done != E_OK) { + if (tok->prompt != NULL) + fprintf(stderr, "\n"); + return EOF; + } + tok->inp = strchr(tok->inp, '\0'); + } +} + + +/* Back-up one character */ + +static void +tok_backup(tok, c) + register struct tok_state *tok; + register int c; +{ + if (c != EOF) { + if (--tok->cur < tok->buf) { + fprintf(stderr, "tok_backup: begin of buffer\n"); + abort(); + } + if (*tok->cur != c) + *tok->cur = c; + } +} + + +/* Return the token corresponding to a single character */ + +int +tok_1char(c) + int c; +{ + switch (c) { + case '(': return LPAR; + case ')': return RPAR; + case '[': return LSQB; + case ']': return RSQB; + case ':': return COLON; + case ',': return COMMA; + case ';': return SEMI; + case '+': return PLUS; + case '-': return MINUS; + case '*': return STAR; + case '/': return SLASH; + case '|': return VBAR; + case '&': return AMPER; + case '<': return LESS; + case '>': return GREATER; + case '=': return EQUAL; + case '.': return DOT; + case '%': return PERCENT; + case '`': return BACKQUOTE; + case '{': return LBRACE; + case '}': return RBRACE; + default: return OP; + } +} + + +/* Get next token, after space stripping etc. */ + +int +tok_get(tok, p_start, p_end) + register struct tok_state *tok; /* In/out: tokenizer state */ + char **p_start, **p_end; /* Out: point to start/end of token */ +{ + register int c; + + /* Get indentation level */ + if (tok->atbol) { + register int col = 0; + tok->atbol = 0; + tok->lineno++; + for (;;) { + c = tok_nextc(tok); + if (c == ' ') + col++; + else if (c == '\t') + col = (col/tok->tabsize + 1) * tok->tabsize; + else + break; + } + tok_backup(tok, c); + if (col == tok->indstack[tok->indent]) { + /* No change */ + } + else if (col > tok->indstack[tok->indent]) { + /* Indent -- always one */ + if (tok->indent+1 >= MAXINDENT) { + fprintf(stderr, "excessive indent\n"); + tok->done = E_TOKEN; + return ERRORTOKEN; + } + tok->pendin++; + tok->indstack[++tok->indent] = col; + } + else /* col < tok->indstack[tok->indent] */ { + /* Dedent -- any number, must be consistent */ + while (tok->indent > 0 && + col < tok->indstack[tok->indent]) { + tok->indent--; + tok->pendin--; + } + if (col != tok->indstack[tok->indent]) { + fprintf(stderr, "inconsistent dedent\n"); + tok->done = E_TOKEN; + return ERRORTOKEN; + } + } + } + + *p_start = *p_end = tok->cur; + + /* Return pending indents/dedents */ + if (tok->pendin != 0) { + if (tok->pendin < 0) { + tok->pendin++; + return DEDENT; + } + else { + tok->pendin--; + return INDENT; + } + } + + again: + /* Skip spaces */ + do { + c = tok_nextc(tok); + } while (c == ' ' || c == '\t'); + + /* Set start of current token */ + *p_start = tok->cur - 1; + + /* Skip comment */ + if (c == '#') { + /* Hack to allow overriding the tabsize in the file. + This is also recognized by vi, when it occurs near the + beginning or end of the file. (Will vi never die...?) */ + int x; + if (sscanf(tok->cur, " vi:set tabsize=%d:", &x) == 1 && + x >= 1 && x <= 40) { + fprintf(stderr, "# vi:set tabsize=%d:\n", x); + tok->tabsize = x; + } + do { + c = tok_nextc(tok); + } while (c != EOF && c != '\n'); + } + + /* Check for EOF and errors now */ + if (c == EOF) + return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; + + /* Identifier (most frequent token!) */ + if (isalpha(c) || c == '_') { + do { + c = tok_nextc(tok); + } while (isalnum(c) || c == '_'); + tok_backup(tok, c); + *p_end = tok->cur; + return NAME; + } + + /* Newline */ + if (c == '\n') { + tok->atbol = 1; + *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + return NEWLINE; + } + + /* Number */ + if (isdigit(c)) { + if (c == '0') { + /* Hex or octal */ + c = tok_nextc(tok); + if (c == '.') + goto fraction; + if (c == 'x' || c == 'X') { + /* Hex */ + do { + c = tok_nextc(tok); + } while (isxdigit(c)); + } + else { + /* Octal; c is first char of it */ + /* There's no 'isoctdigit' macro, sigh */ + while ('0' <= c && c < '8') { + c = tok_nextc(tok); + } + } + } + else { + /* Decimal */ + do { + c = tok_nextc(tok); + } while (isdigit(c)); + /* Accept floating point numbers. + XXX This accepts incomplete things like 12e or 1e+; + worry about that at run-time. + XXX Doesn't accept numbers starting with a dot */ + if (c == '.') { + fraction: + /* Fraction */ + do { + c = tok_nextc(tok); + } while (isdigit(c)); + } + if (c == 'e' || c == 'E') { + /* Exponent part */ + c = tok_nextc(tok); + if (c == '+' || c == '-') + c = tok_nextc(tok); + while (isdigit(c)) { + c = tok_nextc(tok); + } + } + } + tok_backup(tok, c); + *p_end = tok->cur; + return NUMBER; + } + + /* String */ + if (c == '\'') { + for (;;) { + c = tok_nextc(tok); + if (c == '\n' || c == EOF) { + tok->done = E_TOKEN; + return ERRORTOKEN; + } + if (c == '\\') { + c = tok_nextc(tok); + *p_end = tok->cur; + if (c == '\n' || c == EOF) { + tok->done = E_TOKEN; + return ERRORTOKEN; + } + continue; + } + if (c == '\'') + break; + } + *p_end = tok->cur; + return STRING; + } + + /* Line continuation */ + if (c == '\\') { + c = tok_nextc(tok); + if (c != '\n') { + tok->done = E_TOKEN; + return ERRORTOKEN; + } + goto again; /* Read next line */ + } + + /* Punctuation character */ + *p_end = tok->cur; + return tok_1char(c); +} + + +#ifdef DEBUG + +void +tok_dump(type, start, end) + int type; + char *start, *end; +{ + printf("%s", tok_name[type]); + if (type == NAME || type == NUMBER || type == STRING || type == OP) + printf("(%.*s)", (int)(end - start), start); +} + +#endif diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h new file mode 100644 index 0000000..8950c62 --- /dev/null +++ b/Parser/tokenizer.h @@ -0,0 +1,29 @@ +/* Tokenizer interface */ + +#include "token.h" /* For token types */ + +#define MAXINDENT 100 /* Max indentation level */ + +/* Tokenizer state */ +struct tok_state { + /* Input state; buf <= cur <= inp <= end */ + /* NB an entire token must fit in the buffer */ + char *buf; /* Input buffer */ + char *cur; /* Next character in buffer */ + char *inp; /* End of data in buffer */ + char *end; /* End of input buffer */ + int done; /* 0 normally, 1 at EOF, -1 after error */ + FILE *fp; /* Rest of input; NULL if tokenizing a string */ + int tabsize; /* Tab spacing */ + int indent; /* Current indentation index */ + int indstack[MAXINDENT]; /* Stack of indents */ + int atbol; /* Nonzero if at begin of new line */ + int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ + char *prompt, *nextprompt; /* For interactive prompting */ + int lineno; /* Current line number */ +}; + +extern struct tok_state *tok_setups PROTO((char *)); +extern struct tok_state *tok_setupf PROTO((FILE *, char *ps1, char *ps2)); +extern void tok_free PROTO((struct tok_state *)); +extern int tok_get PROTO((struct tok_state *, char **, char **)); |