From 05a1c1c8d017993fac4875d1baed7e62ece0bd93 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 9 Jan 2012 19:59:47 +0000 Subject: [Bug 3464428] string is graph \u0120 is wrong --- ChangeLog | 9 +++ doc/re_syntax.n | 2 +- generic/regc_cvec.c | 51 +++------------ generic/regc_locale.c | 175 ++++++++++++++++++++++++++------------------------ generic/regcomp.c | 87 +++---------------------- generic/tclUtf.c | 125 ++++++++++++++++-------------------- tests/utf.test | 28 ++++++++ tools/uniClass.tcl | 1 + 8 files changed, 203 insertions(+), 275 deletions(-) diff --git a/ChangeLog b/ChangeLog index 05a01bd..1b0d341 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2012-01-09 Jan Nijtmans + + * generic/tclUtf.c: [Bug 3464428] string is graph \u0120 is wrong + * generic/regcomp.c: Remove some unused code + * generic/regc_locale.c: Add table for Unicode [:cntrl:] class + * tools/uniClass.tcl: Generate Unicode [:cntrl:] class table + * tests/utf.test: + * doc/re_syntax: Fix [:print:] class description + 2011-12-23 Jan Nijtmans * generic/tclUtf.c: [Bug 3464428] string is graph \u0120 is wrong diff --git a/doc/re_syntax.n b/doc/re_syntax.n index 3a47e25..8ec2deb 100644 --- a/doc/re_syntax.n +++ b/doc/re_syntax.n @@ -285,7 +285,7 @@ Standard character classes are: \fBdigit\fR A decimal digit. \fBxdigit\fR A hexadecimal digit. \fBalnum\fR An alphanumeric (letter or digit). -\fBprint\fR An alphanumeric (same as alnum). +\fBprint\fR A "printable" (same as graph, except also including space). \fBblank\fR A space or tab character. \fBspace\fR A character producing white space in displayed text. \fBpunct\fR A punctuation character. diff --git a/generic/regc_cvec.c b/generic/regc_cvec.c index d2d56fc..0b976b8 100644 --- a/generic/regc_cvec.c +++ b/generic/regc_cvec.c @@ -3,20 +3,20 @@ * This file is #included by regcomp.c. * * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
- * + * * Development of this software was funded, in part, by Cray Research Inc., * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics * Corporation, none of whom are responsible for the results. The author - * thanks all of them. - * + * thanks all of them. + * * Redistribution and use in source and binary forms -- with or without * modification -- are permitted for any purpose, provided that * redistributions in source form retain this entire copyright notice and * indicate the origin and nature of any modifications. - * + * * I'd appreciate being given credit for this package in the documentation * of software which uses it, but that is not a requirement. - * + * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL @@ -113,38 +113,6 @@ addrange(cv, from, to) } /* - - addmcce - add an MCCE to a cvec - ^ static VOID addmcce(struct cvec *, chr *, chr *); - */ -static VOID -addmcce(cv, startp, endp) - struct cvec *cv; /* character vector */ - chr *startp; /* beginning of text */ - chr *endp; /* just past end of text */ -{ - int len; - int i; - chr *s; - chr *d; - - if (startp == NULL && endp == NULL) { - return; - } - len = endp - startp; - assert(len > 0); - assert(cv->nchrs + len < cv->chrspace - cv->nmccechrs); - assert(cv->nmcces < cv->mccespace); - d = &cv->chrs[cv->chrspace - cv->nmccechrs - len - 1]; - cv->mcces[cv->nmcces++] = d; - for (s = startp, i = len; i > 0; s++, i--) { - *d++ = *s; - } - *d++ = 0; /* endmarker */ - assert(d == &cv->chrs[cv->chrspace - cv->nmccechrs]); - cv->nmccechrs += len + 1; -} - -/* - haschr - does a cvec contain this chr? 
^ static int haschr(struct cvec *, pchr); */ @@ -171,24 +139,23 @@ haschr(cv, c) /* - getcvec - get a cvec, remembering it as v->cv - ^ static struct cvec *getcvec(struct vars *, int, int, int); + ^ static struct cvec *getcvec(struct vars *, int, int); */ static struct cvec * -getcvec(v, nchrs, nranges, nmcces) +getcvec(v, nchrs, nranges) struct vars *v; /* context */ int nchrs; /* to hold this many chrs... */ int nranges; /* ... and this many ranges... */ - int nmcces; /* ... and this many MCCEs */ { if (v->cv != NULL && nchrs <= v->cv->chrspace && - nranges <= v->cv->rangespace && nmcces <= v->cv->mccespace) { + nranges <= v->cv->rangespace) { return clearcvec(v->cv); } if (v->cv != NULL) { freecvec(v->cv); } - v->cv = newcvec(nchrs, nranges, nmcces); + v->cv = newcvec(nchrs, nranges, 0); if (v->cv == NULL) { ERR(REG_ESPACE); } diff --git a/generic/regc_locale.c b/generic/regc_locale.c index 77cfd8e..ef8ecdc 100644 --- a/generic/regc_locale.c +++ b/generic/regc_locale.c @@ -12,7 +12,7 @@ /* ASCII character-name table */ -static struct cname { +static CONST struct cname { CONST char *name; CONST char code; } cnames[] = { @@ -224,6 +224,23 @@ static CONST chr alphaCharTable[] = { #define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr)) /* + * Unicode: control characters. + */ + +static CONST crange controlRangeTable[] = { + {0x007f, 0x009f}, {0x0600, 0x0603}, {0x200b, 0x200f}, {0x202a, 0x202e}, + {0x2060, 0x2064}, {0x206a, 0x206f}, {0xe000, 0xf8ff}, {0xfff9, 0xfffb} +}; + +#define NUM_CONTROL_RANGE (sizeof(controlRangeTable)/sizeof(crange)) + +static CONST chr controlCharTable[] = { + 0x00ad, 0x06dd, 0x070f, 0x17b4, 0x17b5, 0xfeff +}; + +#define NUM_CONTROL_CHAR (sizeof(controlCharTable)/sizeof(chr)) + +/* * Unicode: decimal digit characters. 
*/ @@ -478,7 +495,7 @@ static CONST chr upperCharTable[] = { */ static CONST crange graphRangeTable[] = { - {0x0021, 0x007e}, {0x00a0, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e}, + {0x0021, 0x007e}, {0x00a1, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e}, {0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x0527}, {0x0531, 0x0556}, {0x0559, 0x055f}, {0x0561, 0x0587}, {0x0591, 0x05c7}, {0x05d0, 0x05ea}, {0x05f0, 0x05f4}, {0x0606, 0x061b}, {0x061e, 0x06dc}, {0x06de, 0x070d}, @@ -513,21 +530,21 @@ static CONST crange graphRangeTable[] = { {0x1250, 0x1256}, {0x125a, 0x125d}, {0x1260, 0x1288}, {0x128a, 0x128d}, {0x1290, 0x12b0}, {0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5}, {0x12c8, 0x12d6}, {0x12d8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135a}, - {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x169c}, - {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714}, {0x1720, 0x1736}, - {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770}, {0x1780, 0x17b3}, - {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9}, {0x1800, 0x180e}, - {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa}, {0x18b0, 0x18f5}, - {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b}, {0x1944, 0x196d}, - {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9}, {0x19d0, 0x19da}, - {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c}, {0x1a7f, 0x1a89}, - {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b}, {0x1b50, 0x1b7c}, - {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3}, {0x1bfc, 0x1c37}, - {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2}, {0x1d00, 0x1de6}, - {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45}, {0x1f48, 0x1f4d}, - {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, {0x1fb6, 0x1fc4}, - {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, {0x1ff2, 0x1ff4}, - {0x1ff6, 0x1ffe}, {0x2000, 0x200a}, {0x2010, 0x2029}, {0x202f, 0x205f}, + {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x167f}, + {0x1681, 0x169c}, {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 
0x1714}, + {0x1720, 0x1736}, {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770}, + {0x1780, 0x17b3}, {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9}, + {0x1800, 0x180d}, {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa}, + {0x18b0, 0x18f5}, {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b}, + {0x1944, 0x196d}, {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9}, + {0x19d0, 0x19da}, {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c}, + {0x1a7f, 0x1a89}, {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b}, + {0x1b50, 0x1b7c}, {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3}, + {0x1bfc, 0x1c37}, {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2}, + {0x1d00, 0x1de6}, {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45}, + {0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, + {0x1fb6, 0x1fc4}, {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, + {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffe}, {0x2010, 0x2027}, {0x2030, 0x205e}, {0x2074, 0x208e}, {0x2090, 0x209c}, {0x20a0, 0x20b9}, {0x20d0, 0x20f0}, {0x2100, 0x2189}, {0x2190, 0x23f3}, {0x2400, 0x2426}, {0x2440, 0x244a}, {0x2460, 0x26ff}, {0x2701, 0x27ca}, {0x27ce, 0x2b4c}, {0x2b50, 0x2b59}, @@ -535,7 +552,7 @@ static CONST crange graphRangeTable[] = { {0x2d30, 0x2d65}, {0x2d7f, 0x2d96}, {0x2da0, 0x2da6}, {0x2da8, 0x2dae}, {0x2db0, 0x2db6}, {0x2db8, 0x2dbe}, {0x2dc0, 0x2dc6}, {0x2dc8, 0x2dce}, {0x2dd0, 0x2dd6}, {0x2dd8, 0x2dde}, {0x2de0, 0x2e31}, {0x2e80, 0x2e99}, - {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3000, 0x303f}, + {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3001, 0x303f}, {0x3041, 0x3096}, {0x3099, 0x30ff}, {0x3105, 0x312d}, {0x3131, 0x318e}, {0x3190, 0x31ba}, {0x31c0, 0x31e3}, {0x31f0, 0x321e}, {0x3220, 0x32fe}, {0x3300, 0x4db5}, {0x4dc0, 0x9fcb}, {0xa000, 0xa48c}, {0xa490, 0xa4c6}, @@ -582,43 +599,6 @@ static CONST chr graphCharTable[] = { #define CH NOCELT /* - - nmcces - how many distinct MCCEs are there? 
- ^ static int nmcces(struct vars *); - */ -static int -nmcces(v) - struct vars *v; /* context */ -{ - /* - * No multi-character collating elements defined at the moment. - */ - return 0; -} - -/* - - nleaders - how many chrs can be first chrs of MCCEs? - ^ static int nleaders(struct vars *); - */ -static int -nleaders(v) - struct vars *v; /* context */ -{ - return 0; -} - -/* - - allmcces - return a cvec with all the MCCEs of the locale - ^ static struct cvec *allmcces(struct vars *, struct cvec *); - */ -static struct cvec * -allmcces(v, cv) - struct vars *v; /* context */ - struct cvec *cv; /* this is supposed to have enough room */ -{ - return clearcvec(cv); -} - -/* - element - map collating-element name to celt ^ static celt element(struct vars *, CONST chr *, CONST chr *); */ @@ -690,7 +670,7 @@ range(v, a, b, cases) } if (!cases) { /* easy version */ - cv = getcvec(v, 0, 1, 0); + cv = getcvec(v, 0, 1); NOERRN(); addrange(cv, a, b); return cv; @@ -704,7 +684,7 @@ range(v, a, b, cases) nchrs = (b - a + 1)*2 + 4; - cv = getcvec(v, nchrs, 0, 0); + cv = getcvec(v, nchrs, 0); NOERRN(); for (c=a; c<=b; c++) { @@ -759,7 +739,7 @@ eclass(v, c, cases) */ if ((v->cflags&REG_FAKE) && c == 'x') { - cv = getcvec(v, 4, 0, 0); + cv = getcvec(v, 4, 0); addchr(cv, (chr)'x'); addchr(cv, (chr)'y'); if (cases) { @@ -776,7 +756,7 @@ eclass(v, c, cases) if (cases) { return allcases(v, c); } - cv = getcvec(v, 1, 0, 0); + cv = getcvec(v, 1, 0); assert(cv != NULL); addchr(cv, (chr)c); return cv; @@ -825,15 +805,6 @@ cclass(v, startp, endp, cases) np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); /* - * Remap lower and upper to alpha if the match is case insensitive. - */ - - if (cases && len == 5 && (strncmp("lower", np, 5) == 0 - || strncmp("upper", np, 5) == 0)) { - np = "alpha"; - } - - /* * Map the name to the corresponding enumerated value. */ @@ -851,13 +822,20 @@ cclass(v, startp, endp, cases) } /* + * Remap lower and upper to alpha if the match is case insensitive.
+ */ + + if (cases && ((index == CC_LOWER) || (index == CC_UPPER))) { + index = CC_ALPHA; + } + + /* * Now compute the character class contents. */ switch((enum classes) index) { - case CC_PRINT: case CC_ALNUM: - cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE, 0); + cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE); if (cv) { for (i=0 ; (size_t)icv = newcvec(100, 20, 10); if (v->cv == NULL) return freev(v, REG_ESPACE); - i = nmcces(v); - if (i > 0) { - v->mcces = newcvec(nleaders(v), 0, i); - CNOERR(); - v->mcces = allmcces(v, v->mcces); - leaders(v, v->mcces); - addmcce(v->mcces, (chr *)NULL, (chr *)NULL); /* dummy */ - } CNOERR(); /* parsing */ @@ -1356,7 +1343,7 @@ struct state *rp; assert(right->nins == 0); freestate(v->nfa, right); } - + /* - brackpart - handle one item (or range) within a bracket expression ^ static VOID brackpart(struct vars *, struct state *, struct state *); @@ -1493,50 +1480,6 @@ struct vars *v; } /* - - leaders - process a cvec of collating elements to also include leaders - * Also gives all characters involved their own colors, which is almost - * certainly necessary, and sets up little disconnected subNFA.
- ^ static VOID leaders(struct vars *, struct cvec *); - */ -static VOID -leaders(v, cv) -struct vars *v; -struct cvec *cv; -{ - int mcce; - chr *p; - chr leader; - struct state *s; - struct arc *a; - - v->mccepbegin = newstate(v->nfa); - v->mccepend = newstate(v->nfa); - NOERR(); - - for (mcce = 0; mcce < cv->nmcces; mcce++) { - p = cv->mcces[mcce]; - leader = *p; - if (!haschr(cv, leader)) { - addchr(cv, leader); - s = newstate(v->nfa); - newarc(v->nfa, PLAIN, subcolor(v->cm, leader), - v->mccepbegin, s); - okcolors(v->nfa, v->cm); - } else { - a = findarc(v->mccepbegin, PLAIN, - GETCOLOR(v->cm, leader)); - assert(a != NULL); - s = a->to; - assert(s != v->mccepend); - } - p++; - assert(*p != 0 && *(p+1) == 0); /* only 2-char MCCEs for now */ - newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend); - okcolors(v->nfa, v->cm); - } -} - -/* - onechr - fill in arcs for a plain character, and possible case complements * This is mostly a shortcut for efficient handling of the common case. 
^ static VOID onechr(struct vars *, pchr, struct state *, struct state *); @@ -1581,19 +1524,7 @@ struct state *rp; struct state *s; struct state *ps; /* state in prototype */ - /* need a place to store leaders, if any */ - if (nmcces(v) > 0) { - assert(v->mcces != NULL); - if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) { - if (v->cv2 != NULL) - free(v->cv2); - v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces); - NOERR(); - leads = v->cv2; - } else - leads = clearcvec(v->cv2); - } else - leads = NULL; + leads = NULL; /* first, get the ordinary characters out of the way */ for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) { @@ -2067,7 +1998,7 @@ FILE *f; GUTSMAGIC); fprintf(f, "\n\n\n========= DUMP ==========\n"); - fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", + fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", re->re_nsub, re->re_info, re->re_csize, g->ntree); dumpcolors(&g->cmap, f); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 505dc91..6b5e2e8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -21,36 +21,35 @@ * The following macros are used for fast character category tests. The * x_BITS values are shifted right by the category value to determine whether * the given category is included in the set. 
- */ + */ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) + #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ | (1 << PARAGRAPH_SEPARATOR)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ - (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ - (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ - (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ - (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ + (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ + (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ + (1 << OTHER_NUMBER) | \ + (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ + (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) + /* - * Unicode characters less than this value are represented by themselves - * in UTF-8 strings. + * Unicode characters less than this value are represented by themselves + * in UTF-8 strings. */ #define UNICODE_SELF 0x80 @@ -108,7 +107,7 @@ static int UtfCount _ANSI_ARGS_((int ch)); * *--------------------------------------------------------------------------- */ - + INLINE static int UtfCount(ch) int ch; /* The Tcl_UniChar whose size is returned. 
*/ @@ -146,14 +145,14 @@ UtfCount(ch) * * Results: * The return values is the number of bytes in the buffer that - * were consumed. + * were consumed. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - + INLINE int Tcl_UniCharToUtf(ch, str) int ch; /* The Tcl_UniChar to be stored in the @@ -230,7 +229,7 @@ Tcl_UniCharToUtf(ch, str) * *--------------------------------------------------------------------------- */ - + char * Tcl_UniCharToUtfDString(wString, numChars, dsPtr) CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ @@ -289,7 +288,7 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr) * *--------------------------------------------------------------------------- */ - + int Tcl_UtfToUniChar(str, chPtr) register CONST char *str; /* The UTF-8 string. */ @@ -297,7 +296,7 @@ Tcl_UtfToUniChar(str, chPtr) * by the UTF-8 string. */ { register int byte; - + /* * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. */ @@ -334,7 +333,7 @@ Tcl_UtfToUniChar(str, chPtr) * Three-byte-character lead byte followed by two trail bytes. */ - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); return 3; } @@ -474,15 +473,15 @@ Tcl_UtfCharComplete(str, len) * Plan 9 utflen() and utfnlen(). * * Results: - * As above. + * As above. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - -int + +int Tcl_NumUtfChars(str, len) register CONST char *str; /* The UTF-8 string to measure. 
*/ int len; /* The length of the string in bytes, or -1 @@ -549,7 +548,7 @@ Tcl_UtfFindFirst(string, ch) { int len; Tcl_UniChar find; - + while (1) { len = TclUtfToUniChar(string, &find); if (find == ch) { @@ -590,7 +589,7 @@ Tcl_UtfFindLast(string, ch) int len; Tcl_UniChar find; CONST char *last; - + last = NULL; while (1) { len = TclUtfToUniChar(string, &find); @@ -624,9 +623,9 @@ Tcl_UtfFindLast(string, ch) * *--------------------------------------------------------------------------- */ - + CONST char * -Tcl_UtfNext(str) +Tcl_UtfNext(str) CONST char *str; /* The current location in the string. */ { Tcl_UniChar ch; @@ -664,7 +663,7 @@ Tcl_UtfPrev(str, start) { CONST char *look; int i, byte; - + str--; look = str; for (i = 0; i < TCL_UTF_MAX; i++) { @@ -685,7 +684,7 @@ Tcl_UtfPrev(str, start) } return str; } - + /* *--------------------------------------------------------------------------- * @@ -702,7 +701,7 @@ Tcl_UtfPrev(str, start) * *--------------------------------------------------------------------------- */ - + Tcl_UniChar Tcl_UniCharAtIndex(src, index) register CONST char *src; /* The UTF-8 string to dereference. */ @@ -740,7 +739,7 @@ Tcl_UtfAtIndex(src, index) register int index; /* The position of the desired character. */ { Tcl_UniChar ch; - + while (index > 0) { index--; src += TclUtfToUniChar(src, &ch); @@ -760,7 +759,7 @@ Tcl_UtfAtIndex(src, index) * returns the number of bytes written to dst. At most TCL_UTF_MAX * bytes are written to dst; dst must have been large enough to accept * those bytes. If readPtr isn't NULL then it is filled in with a - * count of the number of bytes in the backslash sequence. + * count of the number of bytes in the backslash sequence. * * Side effects: * The maximum number of bytes it takes to represent a Unicode @@ -839,7 +838,7 @@ Tcl_UtfToUpper(str) * the conversion (thereby causing a segfault), only copy the * upper case char to dst if its size is <= the original char. 
*/ - + if (bytes < UtfCount(upChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -877,7 +876,7 @@ Tcl_UtfToLower(str) Tcl_UniChar ch, lowChar; char *src, *dst; int bytes; - + /* * Iterate over the string until we hit the terminating null. */ @@ -892,7 +891,7 @@ Tcl_UtfToLower(str) * the conversion (thereby causing a segfault), only copy the * lower case char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -931,7 +930,7 @@ Tcl_UtfToTitle(str) Tcl_UniChar ch, titleChar, lowChar; char *src, *dst; int bytes; - + /* * Capitalize the first character and then lowercase the rest of the * characters until we get to a null. @@ -1216,7 +1215,7 @@ Tcl_UniCharLen(str) CONST Tcl_UniChar *str; /* Unicode string to find length of. */ { int len = 0; - + while (*str != '\0') { len++; str++; @@ -1322,9 +1321,7 @@ int Tcl_UniCharIsAlnum(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); + return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } /* @@ -1347,8 +1344,7 @@ int Tcl_UniCharIsAlpha(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((ALPHA_BITS >> category) & 1); + return ((ALPHA_BITS >> GetCategory(ch)) & 1); } /* @@ -1371,7 +1367,7 @@ int Tcl_UniCharIsControl(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); + return ((CONTROL_BITS >> GetCategory(ch)) & 1); } /* @@ -1394,8 +1390,7 @@ int Tcl_UniCharIsDigit(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) - == DECIMAL_DIGIT_NUMBER); + return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } /* @@ -1418,8 +1413,7 @@ int Tcl_UniCharIsGraph(ch) int ch; /* Unicode character to test. 
*/ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return (((PRINT_BITS >> category) & 1) && (ch != ' ')); + return ((GRAPH_BITS >> GetCategory(ch)) & 1); } /* @@ -1442,7 +1436,7 @@ int Tcl_UniCharIsLower(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); + return (GetCategory(ch) == LOWERCASE_LETTER); } /* @@ -1465,8 +1459,7 @@ int Tcl_UniCharIsPrint(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PRINT_BITS >> category) & 1); + return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } /* @@ -1489,8 +1482,7 @@ int Tcl_UniCharIsPunct(ch) int ch; /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PUNCT_BITS >> category) & 1); + return ((PUNCT_BITS >> GetCategory(ch)) & 1); } /* @@ -1513,18 +1505,15 @@ int Tcl_UniCharIsSpace(ch) int ch; /* Unicode character to test. */ { - register int category; - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (ch < 0x80) { + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { return isspace(UCHAR(ch)); /* INTL: ISO space */ } else { - category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((SPACE_BITS >> category) & 1); + return ((SPACE_BITS >> GetCategory(ch)) & 1); } } @@ -1548,7 +1537,7 @@ int Tcl_UniCharIsUpper(ch) int ch; /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); + return (GetCategory(ch) == UPPERCASE_LETTER); } /* @@ -1572,9 +1561,7 @@ int Tcl_UniCharIsWordChar(ch) int ch; /* Unicode character to test. 
*/ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); + return ((WORD_BITS >> GetCategory(ch)) & 1); } /* @@ -1609,16 +1596,16 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) int nocase; /* 0 for case sensitive, 1 for insensitive */ { Tcl_UniChar ch1, p; - + while (1) { p = *pattern; - + /* * See if we're at the end of both the pattern and the string. If * so, we succeeded. If we're at the end of the pattern but not at * the end of the string, we failed. */ - + if (p == 0) { return (*string == 0); } @@ -1633,7 +1620,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) * recursively for each postfix of string, until either we match or we * reach the end of the string. */ - + if (p == '*') { /* * Skip all successive *'s in the pattern @@ -1688,7 +1675,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase) * by a list of characters that are acceptable, or by a range * (two characters separated by "-"). */ - + if (p == '[') { Tcl_UniChar startChar, endChar; @@ -1818,7 +1805,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) * recursively for each postfix of string, until either we match or we * reach the end of the string. */ - + if (p == '*') { /* * Skip all successive *'s in the pattern @@ -1876,7 +1863,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase) * by a list of characters that are acceptable, or by a range * (two characters separated by "-"). 
*/ - + if (p == '[') { Tcl_UniChar ch1, startChar, endChar; diff --git a/tests/utf.test b/tests/utf.test index 6d6f301..1d263d0 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -293,6 +293,34 @@ test utf-21.5 {unicode graph char in regc_locale.c} { # [Bug 3464428] regexp {^[[:graph:]]+$} \u0120 } {1} +test utf-21.6 {TclUniCharIsGraph} { + # [Bug 3464428] + string is graph \u00a0 +} {0} +test utf-21.7 {unicode graph char in regc_locale.c} { + # [Bug 3464428] + regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029 +} {0} +test utf-21.8 {TclUniCharIsPrint} { + # [Bug 3464428] + string is print \u0009 +} {0} +test utf-21.9 {unicode print char in regc_locale.c} { + # [Bug 3464428] + regexp {[[:print:]]} \u0009 +} {0} +test utf-21.10 {unicode print char in regc_locale.c} { + # [Bug 3464428] + regexp {[[:print:]]} \u0009 +} {0} +test utf-21.11 {TclUniCharIsControl} { + # [Bug 3464428] + string is control \u00ad +} {1} +test utf-21.12 {unicode control char in regc_locale.c} { + # [Bug 3464428] + regexp {^[[:cntrl:]]$} \u00ad +} {1} test utf-22.1 {TclUniCharIsWordChar} { string wordend "xyz123_bar fg" 0 diff --git a/tools/uniClass.tcl b/tools/uniClass.tcl index 9f30721..55aa44c 100644 --- a/tools/uniClass.tcl +++ b/tools/uniClass.tcl @@ -87,6 +87,7 @@ puts "/* foreach {type desc} { alpha "alphabetic characters" + control "control characters" digit "decimal digit characters" punct "punctuation characters" space "white space characters" -- cgit v0.12