6 files changed, 62 insertions, 19 deletions
diff --git a/ChangeLog b/ChangeLog
index 7f75101..e6a6f97 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2012-11-05  Jan Nijtmans  <nijtmans@users.sf.net>
+
+	IMPLEMENTATION OF TIP#413
+
+	Align the [string trim] and [string is space] commands, such that
+	[string trim] by default trims all characters for which [string is
+	space] returns 1, augmented with the NUL character.
+
+	* generic/tclUtf.c: Add NEL (U+0085), ZWSP (U+200b), word joiner
+	(U+2060) and BOM (U+feff) to [string is space].
+	* generic/tclCmdMZ.c: Extend the default [string trim] character set.
+	* generic/regc_locale.c: Regexp engine must match [string is space]
+	* doc/string.n
+	* tests/string.test
+
 2012-10-31  Jan Nijtmans  <nijtmans@users.sf.net>
 
 	* win/Makefile.in:   Dde version number to 1.4.0, ready for Tcl 8.6.0rc1
diff --git a/doc/string.n b/doc/string.n
index 1cbea16..3eae964 100644
--- a/doc/string.n
+++ b/doc/string.n
@@ -149,7 +149,8 @@ Any Unicode printing character, including space.
 .IP \fBpunct\fR 12
 Any Unicode punctuation character.
 .IP \fBspace\fR 12
-Any Unicode space character.
+Any Unicode whitespace character, next line (U+0085), zero width space
+(U+200b), word joiner (U+2060) and zero width no-break space (U+feff, BOM).
 .IP \fBtrue\fR 12
 Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
 true.
@@ -335,22 +336,22 @@ specified using the forms described in \fBSTRING INDICES\fR.
 .
 Returns a value equal to \fIstring\fR except that any leading or
 trailing characters present in the string given by \fIchars\fR are removed.  If
-\fIchars\fR is not specified then white space is removed (spaces,
-tabs, newlines, and carriage returns).
+\fIchars\fR is not specified then white space is removed (any character
+for which \fBstring is space\fR returns 1, and "\e0").
 .TP
 \fBstring trimleft \fIstring\fR ?\fIchars\fR?
 .
 Returns a value equal to \fIstring\fR except that any leading
 characters present in the string given by \fIchars\fR are removed.  If
-\fIchars\fR is not specified then white space is removed (spaces,
-tabs, newlines, and carriage returns).
+\fIchars\fR is not specified then white space is removed (any character
+for which \fBstring is space\fR returns 1, and "\e0").
 .TP
 \fBstring trimright \fIstring\fR ?\fIchars\fR?
 .
 Returns a value equal to \fIstring\fR except that any trailing
 characters present in the string given by \fIchars\fR are removed.  If
-\fIchars\fR is not specified then white space is removed (spaces,
-tabs, newlines, and carriage returns).
+\fIchars\fR is not specified then white space is removed (any character
+for which \fBstring is space\fR returns 1, and "\e0").
 .TP
 \fBstring wordend \fIstring charIndex\fR
 .
diff --git a/generic/regc_locale.c b/generic/regc_locale.c
index 40791f4..f3db471 100644
--- a/generic/regc_locale.c
+++ b/generic/regc_locale.c
@@ -354,13 +354,14 @@ static const chr punctCharTable[] = {
  */
 
 static const crange spaceRangeTable[] = {
-    {0x9, 0xd}, {0x2000, 0x200a}
+    {0x9, 0xd}, {0x2000, 0x200b}
 };
 
 #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
 
 static const chr spaceCharTable[] = {
-    0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000
+    0x20, 0x85, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f,
+    0x2060, 0x3000, 0xfeff
 };
 
 #define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 9e720ea..88b3420 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -34,12 +34,35 @@ static int		UniCharIsHexDigit(int character);
 
 /*
  * Default set of characters to trim in [string trim] and friends. This is a
- * UTF-8 literal string containing space, tab, newline, carriage return,
- * ethiopic wordspace (U+1361), ogham space mark (U+1680), and ideographic
- * space (U+3000). [TIP #318]
+ * UTF-8 literal string: NUL plus all [string is space] characters. [TIP #413]
  */
 
-#define DEFAULT_TRIM_SET " \t\n\r\xe1\x8d\xa1\xe1\x9a\x80\xe3\x80\x80"
+#define DEFAULT_TRIM_SET \
+	"\x09\x0a\x0b\x0c\x0d " /* ASCII */\
+	"\xc0\x80" /* nul (U+0000), 2-byte form: no zero byte in literal */\
+	"\xc2\x85" /*     next line (U+0085) */\
+	"\xc2\xa0" /*     non-breaking space (U+00a0) */\
+	"\xe1\x9a\x80" /* ogham space mark (U+1680) */ \
+	"\xe1\xa0\x8e" /* mongolian vowel separator (U+180e) */\
+	"\xe2\x80\x80" /* en quad (U+2000) */\
+	"\xe2\x80\x81" /* em quad (U+2001) */\
+	"\xe2\x80\x82" /* en space (U+2002) */\
+	"\xe2\x80\x83" /* em space (U+2003) */\
+	"\xe2\x80\x84" /* three-per-em space (U+2004) */\
+	"\xe2\x80\x85" /* four-per-em space (U+2005) */\
+	"\xe2\x80\x86" /* six-per-em space (U+2006) */\
+	"\xe2\x80\x87" /* figure space (U+2007) */\
+	"\xe2\x80\x88" /* punctuation space (U+2008) */\
+	"\xe2\x80\x89" /* thin space (U+2009) */\
+	"\xe2\x80\x8a" /* hair space (U+200a) */\
+	"\xe2\x80\x8b" /* zero width space (U+200b) */\
+	"\xe2\x80\xa8" /* line separator (U+2028) */\
+	"\xe2\x80\xa9" /* paragraph separator (U+2029) */\
+	"\xe2\x80\xaf" /* narrow no-break space (U+202f) */\
+	"\xe2\x81\x9f" /* medium mathematical space (U+205f) */\
+	"\xe2\x81\xa0" /* word joiner (U+2060) */\
+	"\xe3\x80\x80" /* ideographic space (U+3000) */\
+	"\xef\xbb\xbf" /* zero width no-break space (U+feff) */
 
 /*
  *----------------------------------------------------------------------
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index f0d08e7..4b5b37b 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -1516,6 +1516,9 @@ Tcl_UniCharIsSpace(
 
     if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
 	return isspace(UCHAR(ch)); /* INTL: ISO space */
+    } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x200b
+	    || (Tcl_UniChar) ch == 0x2060 || (Tcl_UniChar) ch == 0xfeff) {
+	return 1;
     } else {
 	return ((SPACE_BITS >> GetCategory(ch)) & 1);
     }
diff --git a/tests/string.test b/tests/string.test
index e86c0de..f558d30 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -1484,8 +1484,8 @@ test string-18.11 {string trim, unicode} {
     string trim "\xe7\xe8 AB\xe7C \xe8\xe7" \xe7\xe8
 } " AB\xe7C "
 test string-18.12 {string trim, unicode default} {
-    string trim ABC\u1361\u1680\u3000
-} ABC
+    string trim \ufeff\x00\u0085\u00a0\u1680\u180eABC\u1361\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000
+} ABC\u1361
 
 test string-19.1 {string trimleft} {
     list [catch {string trimleft} msg] $msg
@@ -1494,8 +1494,8 @@ test string-19.2 {string trimleft} {
     string trimleft "    XYZ      "
 } {XYZ      }
 test string-19.3 {string trimleft, unicode default} {
-    string trimleft \u1361\u1680\u3000ABC
-} ABC
+    string trimleft \ufeff\u0085\u00a0\x00\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000\u1361ABC
+} \u1361ABC
 
 test string-20.1 {string trimright errors} {
     list [catch {string trimright} msg] $msg
@@ -1513,8 +1513,8 @@ test string-20.5 {string trimright} {
     string trimright ""
 } {}
 test string-20.6 {string trimright, unicode default} {
-    string trimright ABC\u1361\u1680\u3000
-} ABC
+    string trimright ABC\u1361\u0085\x00\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000
+} ABC\u1361
 
 test string-21.1 {string wordend} {
     list [catch {string wordend a} msg] $msg