summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2014-07-11 10:38:55 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2014-07-11 10:38:55 (GMT)
commitdf203baa1f787de574237d71c3df4491edc0dae4 (patch)
tree30ba70d08a8d04c578395576230330a8fc7e09ad
parent8f7dce1bcda533fc3f4cc6aecee46e7ab6a4a7b3 (diff)
downloadtcl-df203baa1f787de574237d71c3df4491edc0dae4.zip
tcl-df203baa1f787de574237d71c3df4491edc0dae4.tar.gz
tcl-df203baa1f787de574237d71c3df4491edc0dae4.tar.bz2
Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for Tcl it still is.
"NEL/Next Line" (U+0085) should have been a Unicode whitespace, but never was in Tcl. This is corrected in Tcl 8.6, but for legacy reasons not in Tcl 8.5. Update documentation accordingly, and extend test-cases for Unicode 7 compliance.
-rw-r--r--doc/string.n3
-rw-r--r--generic/regc_locale.c4
-rw-r--r--tests/utf.test30
3 files changed, 19 insertions, 18 deletions
diff --git a/doc/string.n b/doc/string.n
index f39d57c..7e427ab 100644
--- a/doc/string.n
+++ b/doc/string.n
@@ -161,7 +161,8 @@ Any Unicode printing character, including space.
.IP \fBpunct\fR 12
Any Unicode punctuation character.
.IP \fBspace\fR 12
-Any Unicode space character.
+Any Unicode whitespace character or mongolian vowel separator (U+180e),
+but not NEL/Next Line (U+0085).
.IP \fBtrue\fR 12
Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
true.
diff --git a/generic/regc_locale.c b/generic/regc_locale.c
index 8dba520..e056078 100644
--- a/generic/regc_locale.c
+++ b/generic/regc_locale.c
@@ -381,8 +381,8 @@ static const crange spaceRangeTable[] = {
#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
static const chr spaceCharTable[] = {
- 0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x2060,
- 0x3000, 0xfeff
+ 0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f,
+ 0x3000
};
#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
diff --git a/tests/utf.test b/tests/utf.test
index 35c5f73..30200c1 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -278,15 +278,15 @@ test utf-20.1 {TclUniCharNcmp} {
} {}
test utf-21.1 {TclUniCharIsAlnum} {
- # this returns 1 with Unicode 6 compliance
+ # this returns 1 with Unicode 7 compliance
string is alnum \u1040\u021f\u0220
} {1}
test utf-21.2 {unicode alnum char in regc_locale.c} {
- # this returns 1 with Unicode 6 compliance
+ # this returns 1 with Unicode 7 compliance
list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220]
} {1 1}
test utf-21.3 {unicode print char in regc_locale.c} {
- # this returns 1 with Unicode 6 compliance
+ # this returns 1 with Unicode 7 compliance
regexp {^[[:print:]]+$} \ufbc1
} 1
test utf-21.4 {TclUniCharIsGraph} {
@@ -319,11 +319,11 @@ test utf-21.10 {unicode print char in regc_locale.c} {
} {0}
test utf-21.11 {TclUniCharIsControl} {
# [Bug 3464428]
- string is control \u00ad
+ string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff
} {1}
test utf-21.12 {unicode control char in regc_locale.c} {
# [Bug 3464428], [Bug a876646efe]
- regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad
+ regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff
} {1}
test utf-22.1 {TclUniCharIsWordChar} {
@@ -334,30 +334,30 @@ test utf-22.2 {TclUniCharIsWordChar} {
} 10
test utf-23.1 {TclUniCharIsAlpha} {
- # this returns 1 with Unicode 6 compliance
- string is alpha \u021f\u0220
+ # this returns 1 with Unicode 7 compliance
+ string is alpha \u021f\u0220\u037f\u052f
} {1}
test utf-23.2 {unicode alpha char in regc_locale.c} {
- # this returns 1 with Unicode 6 compliance
- regexp {^[[:alpha:]]+$} \u021f\u0220
+ # this returns 1 with Unicode 7 compliance
+ regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f
} {1}
test utf-24.1 {TclUniCharIsDigit} {
- # this returns 1 with Unicode 6 compliance
+ # this returns 1 with Unicode 7 compliance
string is digit \u1040\uabf0
} {1}
test utf-24.2 {unicode digit char in regc_locale.c} {
- # this returns 1 with Unicode 6 compliance
+ # this returns 1 with Unicode 7 compliance
list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0]
} {1 1}
test utf-24.3 {TclUniCharIsSpace} {
- # this returns 1 with Unicode 6 compliance
- string is space \u1680\u180e
+ # this returns 1 with Unicode 7 compliance
+ string is space \u1680\u180e\u202f
} {1}
test utf-24.4 {unicode space char in regc_locale.c} {
- # this returns 1 with Unicode 6 compliance
- list [regexp {^[[:space:]]+$} \u1680\u180e] [regexp {^\s+$} \u1680\u180e]
+ # this returns 1 with Unicode 7 compliance
+ list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f]
} {1 1}
testConstraint teststringobj [llength [info commands teststringobj]]