summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 7
-rw-r--r-- generic/regc_locale.c 104
-rw-r--r-- generic/tclUtf.c 58
-rw-r--r-- tests/utf.test 28
-rw-r--r-- tools/uniClass.tcl 1
5 files changed, 130 insertions, 68 deletions
diff --git a/ChangeLog b/ChangeLog
index f12cb79..c3159a7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-01-09 Jan Nijtmans <nijtmans@users.sf.net>
+
+ * generic/tclUtf.c: [Bug 3464428] string is graph \u0120 is wrong
+ * generic/regc_locale.c: Add table for Unicode [:cntrl:] class
+ * tools/uniClass.tcl: Generate Unicode [:cntrl:] class table
+ * tests/utf.test: Add tests utf-21.6 through utf-21.12 for the graph, print and control classes
+
2012-01-08 Kevin B. Kenny <kennykb@acm.org>
* library/clock.tcl (ReadZoneinfoFile): Corrected a bug where loading
diff --git a/generic/regc_locale.c b/generic/regc_locale.c
index f652f0e..cd98942 100644
--- a/generic/regc_locale.c
+++ b/generic/regc_locale.c
@@ -224,6 +224,23 @@ static const chr alphaCharTable[] = {
#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
/*
+ * Unicode: control characters.
+ */
+
+static const crange controlRangeTable[] = {
+ {0x007f, 0x009f}, {0x0600, 0x0603}, {0x200b, 0x200f}, {0x202a, 0x202e},
+ {0x2060, 0x2064}, {0x206a, 0x206f}, {0xe000, 0xf8ff}, {0xfff9, 0xfffb}
+};
+
+#define NUM_CONTROL_RANGE (sizeof(controlRangeTable)/sizeof(crange))
+
+static const chr controlCharTable[] = {
+ 0x00ad, 0x06dd, 0x070f, 0x17b4, 0x17b5, 0xfeff
+};
+
+#define NUM_CONTROL_CHAR (sizeof(controlCharTable)/sizeof(chr))
+
+/*
* Unicode: decimal digit characters.
*/
@@ -478,7 +495,7 @@ static const chr upperCharTable[] = {
*/
static const crange graphRangeTable[] = {
- {0x0021, 0x007e}, {0x00a0, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e},
+ {0x0021, 0x007e}, {0x00a1, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e},
{0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x0527}, {0x0531, 0x0556},
{0x0559, 0x055f}, {0x0561, 0x0587}, {0x0591, 0x05c7}, {0x05d0, 0x05ea},
{0x05f0, 0x05f4}, {0x0606, 0x061b}, {0x061e, 0x06dc}, {0x06de, 0x070d},
@@ -513,21 +530,21 @@ static const crange graphRangeTable[] = {
{0x1250, 0x1256}, {0x125a, 0x125d}, {0x1260, 0x1288}, {0x128a, 0x128d},
{0x1290, 0x12b0}, {0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5},
{0x12c8, 0x12d6}, {0x12d8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135a},
- {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x169c},
- {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714}, {0x1720, 0x1736},
- {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770}, {0x1780, 0x17b3},
- {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9}, {0x1800, 0x180e},
- {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa}, {0x18b0, 0x18f5},
- {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b}, {0x1944, 0x196d},
- {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9}, {0x19d0, 0x19da},
- {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c}, {0x1a7f, 0x1a89},
- {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b}, {0x1b50, 0x1b7c},
- {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3}, {0x1bfc, 0x1c37},
- {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2}, {0x1d00, 0x1de6},
- {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45}, {0x1f48, 0x1f4d},
- {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, {0x1fb6, 0x1fc4},
- {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, {0x1ff2, 0x1ff4},
- {0x1ff6, 0x1ffe}, {0x2000, 0x200a}, {0x2010, 0x2029}, {0x202f, 0x205f},
+ {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x167f},
+ {0x1681, 0x169c}, {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714},
+ {0x1720, 0x1736}, {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770},
+ {0x1780, 0x17b3}, {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9},
+ {0x1800, 0x180d}, {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa},
+ {0x18b0, 0x18f5}, {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b},
+ {0x1944, 0x196d}, {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9},
+ {0x19d0, 0x19da}, {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c},
+ {0x1a7f, 0x1a89}, {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b},
+ {0x1b50, 0x1b7c}, {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3},
+ {0x1bfc, 0x1c37}, {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2},
+ {0x1d00, 0x1de6}, {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45},
+ {0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4},
+ {0x1fb6, 0x1fc4}, {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef},
+ {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffe}, {0x2010, 0x2027}, {0x2030, 0x205e},
{0x2074, 0x208e}, {0x2090, 0x209c}, {0x20a0, 0x20b9}, {0x20d0, 0x20f0},
{0x2100, 0x2189}, {0x2190, 0x23f3}, {0x2400, 0x2426}, {0x2440, 0x244a},
{0x2460, 0x26ff}, {0x2701, 0x27ca}, {0x27ce, 0x2b4c}, {0x2b50, 0x2b59},
@@ -535,7 +552,7 @@ static const crange graphRangeTable[] = {
{0x2d30, 0x2d65}, {0x2d7f, 0x2d96}, {0x2da0, 0x2da6}, {0x2da8, 0x2dae},
{0x2db0, 0x2db6}, {0x2db8, 0x2dbe}, {0x2dc0, 0x2dc6}, {0x2dc8, 0x2dce},
{0x2dd0, 0x2dd6}, {0x2dd8, 0x2dde}, {0x2de0, 0x2e31}, {0x2e80, 0x2e99},
- {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3000, 0x303f},
+ {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3001, 0x303f},
{0x3041, 0x3096}, {0x3099, 0x30ff}, {0x3105, 0x312d}, {0x3131, 0x318e},
{0x3190, 0x31ba}, {0x31c0, 0x31e3}, {0x31f0, 0x321e}, {0x3220, 0x32fe},
{0x3300, 0x4db5}, {0x4dc0, 0x9fcb}, {0xa000, 0xa48c}, {0xa490, 0xa4c6},
@@ -788,15 +805,6 @@ cclass(
np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
/*
- * Remap lower and upper to alpha if the match is case insensitive.
- */
-
- if (cases && len == 5 && (strncmp("lower", np, 5) == 0
- || strncmp("upper", np, 5) == 0)) {
- np = "alpha";
- }
-
- /*
* Map the name to the corresponding enumerated value.
*/
@@ -814,6 +822,14 @@ cclass(
}
/*
+ * Remap lower and upper to alpha if the match is case insensitive. NOTE(review): the assignment below uses CC_ALNUM, not an alpha class -- confirm which is intended.
+ */
+
+ if (cases && ((index == CC_LOWER) || (index == CC_UPPER))) {
+ index = CC_ALNUM;
+ }
+
+ /*
* Now compute the character class contents.
*/
@@ -858,9 +874,16 @@ cclass(
addchr(cv, ' ');
break;
case CC_CNTRL:
- cv = getcvec(v, 0, 2);
- addrange(cv, 0x0, 0x1f);
- addrange(cv, 0x7f, 0x9f);
+ cv = getcvec(v, NUM_CONTROL_CHAR, NUM_CONTROL_RANGE);
+ if (cv) {
+ for (i=0 ; (size_t)i<NUM_CONTROL_RANGE ; i++) {
+ addrange(cv, controlRangeTable[i].start,
+ controlRangeTable[i].end);
+ }
+ for (i=0 ; (size_t)i<NUM_CONTROL_CHAR ; i++) {
+ addchr(cv, controlCharTable[i]);
+ }
+ }
break;
case CC_DIGIT:
cv = getcvec(v, 0, NUM_DIGIT_RANGE);
@@ -937,13 +960,28 @@ cclass(
}
break;
case CC_PRINT:
+ cv = getcvec(v, NUM_SPACE_CHAR + NUM_GRAPH_CHAR, NUM_SPACE_RANGE + NUM_GRAPH_RANGE - 1);
+ if (cv) {
+ for (i=1 ; (size_t)i<NUM_SPACE_RANGE ; i++) {
+ addrange(cv, spaceRangeTable[i].start,
+ spaceRangeTable[i].end);
+ }
+ for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) {
+ addchr(cv, spaceCharTable[i]);
+ }
+ for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
+ addrange(cv, graphRangeTable[i].start,
+ graphRangeTable[i].end);
+ }
+ for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) {
+ addchr(cv, graphCharTable[i]);
+ }
+ }
+ break;
case CC_GRAPH:
cv = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE);
if (cv) {
- /* For CC_PRINT, include space as well */
- addrange(cv, graphRangeTable[0].start - (index == CC_PRINT),
- graphRangeTable[0].end);
- for (i=1 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
+ for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
addrange(cv, graphRangeTable[i].start,
graphRangeTable[i].end);
}
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 04e9d77..e13bf92 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -26,28 +26,27 @@
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
+
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
| (1 << PARAGRAPH_SEPARATOR))
-#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
-
-#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
- (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
- (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
- (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
- (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
- (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
- (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
- (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
- (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+ (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+ (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+ (1 << OTHER_NUMBER) | \
+ (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+ (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+
/*
* Unicode characters less than this value are represented by themselves in
* UTF-8 strings.
@@ -1329,9 +1328,7 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+ return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1354,8 +1351,7 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((ALPHA_BITS >> category) & 1);
+ return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1378,7 +1374,7 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+ return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1401,7 +1397,7 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
- return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
+ return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
/*
@@ -1424,8 +1420,7 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return (((PRINT_BITS >> category) & 1) && (ch != ' '));
+ return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1448,7 +1443,7 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+ return (GetCategory(ch) == LOWERCASE_LETTER);
}
/*
@@ -1471,8 +1466,7 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PRINT_BITS >> category) & 1);
+ return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1495,8 +1489,7 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PUNCT_BITS >> category) & 1);
+ return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1519,18 +1512,15 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
- register int category;
-
/*
* If the character is within the first 128 characters (below 0x80), just use
* the standard C function, otherwise consult the Unicode table.
*/
- if (ch < 0x80) {
- return TclIsSpaceProc((char)ch);
+ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ return isspace(UCHAR(ch)); /* INTL: ISO space */
} else {
- category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((SPACE_BITS >> category) & 1);
+ return ((SPACE_BITS >> GetCategory(ch)) & 1);
}
}
@@ -1554,7 +1544,7 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+ return (GetCategory(ch) == UPPERCASE_LETTER);
}
/*
@@ -1577,9 +1567,7 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+ return ((WORD_BITS >> GetCategory(ch)) & 1);
}
/*
diff --git a/tests/utf.test b/tests/utf.test
index 875c5dc..fcd2a73 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -309,6 +309,34 @@ test utf-21.5 {unicode graph char in regc_locale.c} {
# [Bug 3464428]
regexp {^[[:graph:]]+$} \u0120
} {1}
+test utf-21.6 {TclUniCharIsGraph} {
+ # [Bug 3464428]
+ string is graph \u00a0
+} {0}
+test utf-21.7 {unicode graph char in regc_locale.c} {
+ # [Bug 3464428]
+ regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029
+} {0}
+test utf-21.8 {TclUniCharIsPrint} {
+ # [Bug 3464428]
+ string is print \u0009
+} {0}
+test utf-21.9 {unicode print char in regc_locale.c} {
+ # [Bug 3464428]
+ regexp {[[:print:]]} \u0009
+} {0}
+test utf-21.10 {unicode print char in regc_locale.c} {
+ # [Bug 3464428]
+ regexp {[[:print:]]} \u0009
+} {0}
+test utf-21.11 {TclUniCharIsControl} {
+ # [Bug 3464428]
+ string is control \u00ad
+} {1}
+test utf-21.12 {unicode control char in regc_locale.c} {
+ # [Bug 3464428]
+ regexp {^[[:cntrl:]]$} \u00ad
+} {1}
test utf-22.1 {TclUniCharIsWordChar} {
string wordend "xyz123_bar fg" 0
diff --git a/tools/uniClass.tcl b/tools/uniClass.tcl
index 9a1bf13..282ce95 100644
--- a/tools/uniClass.tcl
+++ b/tools/uniClass.tcl
@@ -87,6 +87,7 @@ puts "/*
foreach {type desc} {
alpha "alphabetic characters"
+ control "control characters"
digit "decimal digit characters"
punct "punctuation characters"
space "white space characters"