diff options
author | stanton <stanton> | 1998-10-21 20:39:57 (GMT) |
---|---|---|
committer | stanton <stanton> | 1998-10-21 20:39:57 (GMT) |
commit | 7e7056e21d0a0d9fa39bdfd742e82b101a6c4b7c (patch) | |
tree | 99e08a09e1567ade05e7bc7edac3758b3695d424 /generic/regc_locale.c | |
parent | 966ff877247e93fbe6e641cfa77df19d03cfe932 (diff) | |
download | tcl-7e7056e21d0a0d9fa39bdfd742e82b101a6c4b7c.zip tcl-7e7056e21d0a0d9fa39bdfd742e82b101a6c4b7c.tar.gz tcl-7e7056e21d0a0d9fa39bdfd742e82b101a6c4b7c.tar.bz2 |
Integrated latest regexp changes from Henry Spencer.
Moved regexp related declarations out of tclInt.h and into tclRegexp.h.
Added "encoding" command.
Diffstat (limited to 'generic/regc_locale.c')
-rw-r--r-- | generic/regc_locale.c | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/generic/regc_locale.c b/generic/regc_locale.c new file mode 100644 index 0000000..769241f --- /dev/null +++ b/generic/regc_locale.c @@ -0,0 +1,426 @@ +/* + * locale-specific stuff, including MCCE handling + * This file is #included by regcomp.c. + * + * No MCCEs for Tcl. The handling of character names and classes is + * still ASCII-centric, and needs to be extended to handle full Unicode. + */ + +/* ASCII character-name table */ +static struct cname { + char *name; + char code; +} cnames[] = { + {"NUL", '\0'}, + {"SOH", '\001'}, + {"STX", '\002'}, + {"ETX", '\003'}, + {"EOT", '\004'}, + {"ENQ", '\005'}, + {"ACK", '\006'}, + {"BEL", '\007'}, + {"alert", '\007'}, + {"BS", '\010'}, + {"backspace", '\b'}, + {"HT", '\011'}, + {"tab", '\t'}, + {"LF", '\012'}, + {"newline", '\n'}, + {"VT", '\013'}, + {"vertical-tab", '\v'}, + {"FF", '\014'}, + {"form-feed", '\f'}, + {"CR", '\015'}, + {"carriage-return", '\r'}, + {"SO", '\016'}, + {"SI", '\017'}, + {"DLE", '\020'}, + {"DC1", '\021'}, + {"DC2", '\022'}, + {"DC3", '\023'}, + {"DC4", '\024'}, + {"NAK", '\025'}, + {"SYN", '\026'}, + {"ETB", '\027'}, + {"CAN", '\030'}, + {"EM", '\031'}, + {"SUB", '\032'}, + {"ESC", '\033'}, + {"IS4", '\034'}, + {"FS", '\034'}, + {"IS3", '\035'}, + {"GS", '\035'}, + {"IS2", '\036'}, + {"RS", '\036'}, + {"IS1", '\037'}, + {"US", '\037'}, + {"space", ' '}, + {"exclamation-mark", '!'}, + {"quotation-mark", '"'}, + {"number-sign", '#'}, + {"dollar-sign", '$'}, + {"percent-sign", '%'}, + {"ampersand", '&'}, + {"apostrophe", '\''}, + {"left-parenthesis", '('}, + {"right-parenthesis", ')'}, + {"asterisk", '*'}, + {"plus-sign", '+'}, + {"comma", ','}, + {"hyphen", '-'}, + {"hyphen-minus", '-'}, + {"period", '.'}, + {"full-stop", '.'}, + {"slash", '/'}, + {"solidus", '/'}, + {"zero", '0'}, + {"one", '1'}, + {"two", '2'}, + {"three", '3'}, + {"four", '4'}, + {"five", '5'}, + {"six", '6'}, + {"seven", '7'}, + {"eight", '8'}, + {"nine", '9'}, + {"colon", ':'}, + {"semicolon", ';'}, + 
{"less-than-sign", '<'}, + {"equals-sign", '='}, + {"greater-than-sign", '>'}, + {"question-mark", '?'}, + {"commercial-at", '@'}, + {"left-square-bracket", '['}, + {"backslash", '\\'}, + {"reverse-solidus", '\\'}, + {"right-square-bracket", ']'}, + {"circumflex", '^'}, + {"circumflex-accent", '^'}, + {"underscore", '_'}, + {"low-line", '_'}, + {"grave-accent", '`'}, + {"left-brace", '{'}, + {"left-curly-bracket", '{'}, + {"vertical-line", '|'}, + {"right-brace", '}'}, + {"right-curly-bracket", '}'}, + {"tilde", '~'}, + {"DEL", '\177'}, + {NULL, 0} +}; + +/* ASCII character-class table */ +static struct cclass { + char *name; + char *chars; + int hasch; +} cclasses[] = { + {"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789", 1}, + {"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + 1}, + {"blank", " \t", 0}, + {"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ +\25\26\27\30\31\32\33\34\35\36\37\177", 0}, + {"digit", "0123456789", 0}, + {"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + 1}, + {"lower", "abcdefghijklmnopqrstuvwxyz", + 1}, + {"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", + 1}, + {"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + 0}, + {"space", "\t\n\v\f\r ", 0}, + {"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + 0}, + {"xdigit", "0123456789ABCDEFabcdef", + 0}, + {NULL, 0, 0} +}; + +#define CH NOCELT + +/* + - nmcces - how many distinct MCCEs are there? + ^ static int nmcces(struct vars *); + */ +static int +nmcces(v) +struct vars *v; +{ + return 0; +} + +/* + - nleaders - how many chrs can be first chrs of MCCEs? 
+ ^ static int nleaders(struct vars *); + */ +static int +nleaders(v) +struct vars *v; +{ + return 0; +} + +/* + - allmcces - return a cvec with all the MCCEs of the locale + ^ static struct cvec *allmcces(struct vars *, struct cvec *); + */ +static struct cvec * +allmcces(v, cv) +struct vars *v; +struct cvec *cv; /* this is supposed to have enough room */ +{ + return clearcvec(cv); +} + +/* + - element - map collating-element name to celt + ^ static celt element(struct vars *, chr *, chr *); + */ +static celt +element(v, startp, endp) +struct vars *v; +chr *startp; /* points to start of name */ +chr *endp; /* points just past end of name */ +{ + struct cname *cn; + size_t len; + Tcl_DString ds; + char *np; + + /* generic: one-chr names stand for themselves */ + assert(startp < endp); + len = endp - startp; + if (len == 1) + return *startp; + + NOTE(REG_ULOCALE); + + /* search table */ + Tcl_DStringInit(&ds); + np = TclUniCharToUtfDString(startp, (int)len, &ds); + for (cn = cnames; cn->name != NULL; cn++) + if (strlen(cn->name) == len && strncmp(cn->name, np, len) == 0) + break; /* NOTE BREAK OUT */ + Tcl_DStringFree(&ds); + if (cn->name != NULL) + return CHR(cn->code); + + /* couldn't find it */ + ERR(REG_ECOLLATE); + return 0; +} + +/* + - range - supply cvec for a range, including legality check + ^ static struct cvec *range(struct vars *, celt, celt, int); + */ +static struct cvec * +range(v, a, b, cases) +struct vars *v; +celt a; +celt b; /* might equal a */ +int cases; /* case-independent? 
*/ +{ + int nchrs; + struct cvec *cv; + celt c, lc, uc, tc; + + if (a != b && !before(a, b)) { + ERR(REG_ERANGE); + return NULL; + } + + nchrs = b - a + 1; + if (cases) + nchrs *= 2; + cv = getcvec(v, nchrs, 0); + NOERRN(); + + for (c = a; c <= b; c++) { + addchr(cv, c); + if (cases) { + lc = Tcl_UniCharToLower((chr)c); + uc = Tcl_UniCharToUpper((chr)c); + tc = Tcl_UniCharToTitle((chr)c); + if (c != lc) { + addchr(cv, lc); + } + if (c != uc) { + addchr(cv, uc); + } + if (c != tc && tc != uc) { + addchr(cv, tc); + } + } + } + + return cv; +} + +/* + - before - is celt x before celt y, for purposes of range legality? + ^ static int before(celt, celt); + */ +static int /* predicate */ +before(x, y) +celt x; +celt y; +{ + /* trivial because no MCCEs */ + if (x < y) + return 1; + return 0; +} + +/* + - eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + ^ static struct cvec *eclass(struct vars *, celt, int); + */ +static struct cvec * +eclass(v, c, cases) +struct vars *v; +celt c; +int cases; /* all cases? */ +{ + struct cvec *cv; + + /* crude fake equivalence class for testing */ + if ((v->cflags&REG_FAKEEC) && c == 'x') { + cv = getcvec(v, 4, 0); + addchr(cv, (chr)'x'); + addchr(cv, (chr)'y'); + if (cases) { + addchr(cv, (chr)'X'); + addchr(cv, (chr)'Y'); + } + return cv; + } + + /* otherwise, none */ + if (cases) + return allcases(v, c); + cv = getcvec(v, 1, 0); + assert(cv != NULL); + addchr(cv, (chr)c); + return cv; +} + +/* + - cclass - supply cvec for a character class + * Must include case counterparts on request. + ^ static struct cvec *cclass(struct vars *, chr *, chr *, int); + */ +static struct cvec * +cclass(v, startp, endp, cases) +struct vars *v; +chr *startp; /* where the name starts */ +chr *endp; /* just past the end of the name */ +int cases; /* case-independent? 
*/ +{ + size_t len; + char *p; + struct cclass *cc; + struct cvec *cv; + Tcl_DString ds; + char *np; + + /* find the name */ + len = endp - startp; + Tcl_DStringInit(&ds); + np = TclUniCharToUtfDString(startp, (int)len, &ds); + if (cases && len == 5 && (strncmp("lower", np, 5) == 0 || + strncmp("upper", np, 5) == 0)) + np = "alpha"; + for (cc = cclasses; cc->name != NULL; cc++) + if (strlen(cc->name) == len && strncmp(cc->name, np, len) == 0) + break; /* NOTE BREAK OUT */ + Tcl_DStringFree(&ds); + if (cc->name == NULL) { + ERR(REG_ECTYPE); + return NULL; + } + + /* set up vector */ + cv = getcvec(v, (int)strlen(cc->chars), 0); + if (cv == NULL) { + ERR(REG_ESPACE); + return NULL; + } + + /* fill it in */ + for (p = cc->chars; *p != '\0'; p++) + addchr(cv, (chr)*p); + + return cv; +} + +/* + - allcases - supply cvec for all case counterparts of a chr (including itself) + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + ^ static struct cvec *allcases(struct vars *, pchr); + */ +static struct cvec * +allcases(v, pc) +struct vars *v; +pchr pc; +{ + struct cvec *cv = getcvec(v, 2, 0); + chr c = (chr)pc; + + assert(cv != NULL); + addchr(cv, c); + if (TclUniCharIsUpper(c)) + addchr(cv, Tcl_UniCharToLower(c)); + else if (TclUniCharIsLower(c)) + addchr(cv, Tcl_UniCharToUpper(c)); + + return cv; +} + +/* + - cmp - chr-substring compare + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + ^ static int cmp(CONST chr *, CONST chr *, size_t); + */ +static int /* 0 for equal, nonzero for unequal */ +cmp(x, y, len) +CONST chr *x; +CONST chr *y; +size_t len; /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len*sizeof(chr)); +} + +/* + - casecmp - case-independent chr-substring compare + * REG_ICASE backrefs need this. 
It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + ^ static int casecmp(CONST chr *, CONST chr *, size_t); + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp(x, y, len) +CONST chr *x; +CONST chr *y; +size_t len; /* exact length of comparison */ +{ + size_t i; + CONST chr *xp; + CONST chr *yp; + + for (xp = x, yp = y, i = len; i > 0; i--) + if (Tcl_UniCharToLower(*xp++) != Tcl_UniCharToLower(*yp++)) + return 1; + return 0; +} |