From f61c3bdb284cedeb0db64a332f84bba54262565c Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Tue, 1 May 2018 19:02:32 +0000
Subject: Start implementing TIP #497. regexp's now are >BMP-aware. WIP

---
 generic/regc_locale.c |   4 +-
 generic/regcustom.h   |  12 ++---
 generic/regex.h       |   2 +-
 generic/tclInt.h      |   9 ++++
 generic/tclRegexp.c   |  39 +++++++++-------
 generic/tclUtf.c      | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 162 insertions(+), 29 deletions(-)

diff --git a/generic/regc_locale.c b/generic/regc_locale.c
index 002b264..19ac511 100644
--- a/generic/regc_locale.c
+++ b/generic/regc_locale.c
@@ -828,7 +828,7 @@ element(
      */
 
     Tcl_DStringInit(&ds);
-    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
+    np = TclUnicodeToUtfDString(startp, (int)len, &ds);
     for (cn=cnames; cn->name!=NULL; cn++) {
 	if (strlen(cn->name)==len && strncmp(cn->name, np, len)==0) {
 	    break;			/* NOTE BREAK OUT */
@@ -1000,7 +1000,7 @@ cclass(
 
     len = endp - startp;
     Tcl_DStringInit(&ds);
-    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
+    np = TclUnicodeToUtfDString(startp, (int)len, &ds);
 
     /*
      * Map the name to the corresponding enumerated value.
diff --git a/generic/regcustom.h b/generic/regcustom.h
index 095385d..5befada 100644
--- a/generic/regcustom.h
+++ b/generic/regcustom.h
@@ -66,7 +66,7 @@
 #undef __REG_NOCHAR
 #endif
 /* Interface types */
-#define	__REG_WIDE_T	Tcl_UniChar
+#define	__REG_WIDE_T	unsigned
 #define	__REG_REGOFF_T	long	/* Not really right, but good enough... */
 /* Names and declarations */
 #define	__REG_WIDE_COMPILE	TclReComp
@@ -81,22 +81,16 @@
  * Internal character type and related.
  */
 
-typedef Tcl_UniChar chr;	/* The type itself. */
+typedef unsigned chr;	/* The type itself. */
 typedef int pchr;		/* What it promotes to. */
 typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
 typedef int celt;		/* Type to hold chr, or NOCELT */
 #define	NOCELT (-1)		/* Celt value which is not valid chr */
 #define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
 #define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
-#if TCL_UTF_MAX > 4
 #define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
 #define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
-#define	CHR_MAX	0x10ffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
-#else
-#define	CHRBITS	16		/* Bits in a chr; must not use sizeof */
-#define	CHR_MIN	0x0000		/* Smallest and largest chr; the value */
-#define	CHR_MAX	0xffff		/* CHR_MAX-CHR_MIN+1 should fit in uchr */
-#endif
+#define	CHR_MAX	0x0010ffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
 
 /*
  * Functions operating on chr.
diff --git a/generic/regex.h b/generic/regex.h
index 8845f72..0b559f4 100644
--- a/generic/regex.h
+++ b/generic/regex.h
@@ -99,7 +99,7 @@ extern "C" {
 #undef __REG_NOCHAR
 #endif
 /* interface types */
-#define	__REG_WIDE_T	Tcl_UniChar
+#define	__REG_WIDE_T	unsigned
 #define	__REG_REGOFF_T	long	/* not really right, but good enough... */
 /* names and declarations */
 #define	__REG_WIDE_COMPILE	TclReComp
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 50048e9..28549d9 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3985,6 +3985,15 @@ MODULE_SCOPE int	TclPtrUnsetVarIdx(Tcl_Interp *interp, Var *varPtr,
 MODULE_SCOPE void	TclInvalidateNsPath(Namespace *nsPtr);
 MODULE_SCOPE void	TclFindArrayPtrElements(Var *arrayPtr,
 			    Tcl_HashTable *tablePtr);
+#if TCL_UTF_MAX <= 4
+MODULE_SCOPE char *	TclUnicodeToUtfDString(const unsigned *uniStr,
+				int uniLength, Tcl_DString *dsPtr);
+MODULE_SCOPE unsigned *	TclUtfToUnicodeDString(const char *src, int length,
+				Tcl_DString *dsPtr);
+#else
+#   define TclUnicodeToUtfDString Tcl_UniCharToUtfDString
+#   define TclUtfToUnicodeDString Tcl_UtfToUniCharDString
+#endif
 
 /*
  * The new extended interface to the variable traces.
diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c
index 5f8dc20..79b979c 100644
--- a/generic/tclRegexp.c
+++ b/generic/tclRegexp.c
@@ -90,8 +90,8 @@ static void		DupRegexpInternalRep(Tcl_Obj *srcPtr,
 static void		FinalizeRegexp(ClientData clientData);
 static void		FreeRegexp(TclRegexp *regexpPtr);
 static void		FreeRegexpInternalRep(Tcl_Obj *objPtr);
-static int		RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re,
-			    const Tcl_UniChar *uniString, int numChars,
+static int		RegExpExecUnicode(Tcl_Interp *interp, Tcl_RegExp re,
+			    const __REG_WIDE_T *uniString, int numChars,
 			    int nmatches, int flags);
 static int		SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
 
@@ -175,7 +175,7 @@ Tcl_RegExpExec(
     int flags, result, numChars;
     TclRegexp *regexp = (TclRegexp *) re;
     Tcl_DString ds;
-    const Tcl_UniChar *ustr;
+    const __REG_WIDE_T *ustr;
 
     /*
      * If the starting point is offset from the beginning of the buffer, then
@@ -200,9 +200,9 @@ Tcl_RegExpExec(
      */
 
     Tcl_DStringInit(&ds);
-    ustr = Tcl_UtfToUniCharDString(text, -1, &ds);
-    numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
-    result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */,
+    ustr = TclUtfToUnicodeDString(text, -1, &ds);
+    numChars = Tcl_DStringLength(&ds) / sizeof(__REG_WIDE_T);
+    result = RegExpExecUnicode(interp, re, ustr, numChars, -1 /* nmatches */,
 	    flags);
     Tcl_DStringFree(&ds);
 
@@ -261,7 +261,7 @@ Tcl_RegExpRange(
 /*
  *---------------------------------------------------------------------------
  *
- * RegExpExecUniChar --
+ * RegExpExecUnicode --
  *
  *	Execute the regular expression matcher using a compiled form of a
  *	regular expression and save information about any match that is found.
@@ -279,12 +279,12 @@ Tcl_RegExpRange(
  */
 
 static int
-RegExpExecUniChar(
+RegExpExecUnicode(
     Tcl_Interp *interp,		/* Interpreter to use for error reporting. */
     Tcl_RegExp re,		/* Compiled regular expression; returned by a
 				 * previous call to Tcl_GetRegExpFromObj */
-    const Tcl_UniChar *wString,	/* String against which to match re. */
-    int numChars,		/* Length of Tcl_UniChar string (must be
+    const __REG_WIDE_T *wString,	/* String against which to match re. */
+    int numChars,		/* Length of Unicode string (must be
 				 * >=0). */
     int nmatches,		/* How many subexpression matches (counting
 				 * the whole match as subexpression 0) are of
@@ -432,8 +432,9 @@ Tcl_RegExpExecObj(
     int flags)			/* Regular expression execution flags. */
 {
     TclRegexp *regexpPtr = (TclRegexp *) re;
-    Tcl_UniChar *udata;
-    int length;
+    Tcl_DString ds;
+    __REG_WIDE_T *udata;
+    int length, result;
     int reflags = regexpPtr->flags;
 #define TCL_REG_GLOBOK_FLAGS \
 	(TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)
@@ -464,7 +465,9 @@ Tcl_RegExpExecObj(
     regexpPtr->string = NULL;
     regexpPtr->objPtr = textObj;
 
-    udata = Tcl_GetUnicodeFromObj(textObj, &length);
+    Tcl_DStringInit(&ds);
+    udata = TclUtfToUnicodeDString(Tcl_GetString(textObj), -1, &ds);
+    length = Tcl_DStringLength(&ds)/sizeof(__REG_WIDE_T);
 
     if (offset > length) {
 	offset = length;
@@ -472,7 +475,9 @@ Tcl_RegExpExecObj(
     udata += offset;
     length -= offset;
 
-    return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);
+    result = RegExpExecUnicode(interp, re, udata, length, nmatches, flags);
+    Tcl_DStringFree(&ds);
+    return result;
 }
 
 /*
@@ -858,7 +863,7 @@ CompileRegexp(
     int flags)			/* Compilation flags. */
 {
     TclRegexp *regexpPtr;
-    const Tcl_UniChar *uniString;
+    const __REG_WIDE_T *uniString;
     int numChars, status, i, exact;
     Tcl_DString stringBuf;
     ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
@@ -923,8 +928,8 @@ CompileRegexp(
      */
 
     Tcl_DStringInit(&stringBuf);
-    uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf);
-    numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);
+    uniString = TclUtfToUnicodeDString(string, length, &stringBuf);
+    numChars = Tcl_DStringLength(&stringBuf) / sizeof(__REG_WIDE_T);
 
     /*
      * Compile the string and check for errors.
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 1d73a7a..259a124 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -235,6 +235,63 @@ Tcl_UniCharToUtfDString(
 /*
  *---------------------------------------------------------------------------
  *
+ * TclUnicodeToUtfDString --
+ *
+ *	Convert the given Unicode string to UTF-8.
+ *
+ * Results:
+ *	The return value is a pointer to the UTF-8 representation of the
+ *	Unicode string. Storage for the return value is appended to the end of
+ *	dsPtr.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+#if TCL_UTF_MAX <= 4
+char *
+TclUnicodeToUtfDString(
+    const unsigned *uniStr,	/* Unicode string to convert to UTF-8. */
+    int uniLength,		/* Length of Unicode string in code points
+				 * (must be >= 0). */
+    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
+				 * to this previously initialized DString. */
+{
+    const unsigned *w, *wEnd;
+    char *p, *string;
+    int oldLength;
+
+    /*
+     * At most 4 UTF-8 bytes per code point; only the new data is scaled.
+     */
+
+    oldLength = Tcl_DStringLength(dsPtr);
+    Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
+    string = Tcl_DStringValue(dsPtr) + oldLength;
+
+    p = string;
+    wEnd = uniStr + uniLength;
+    for (w = uniStr; w < wEnd; w++) {
+	if (*w >= 0xD800 && *w <= 0xDFFF) {
+	    /* Surrogate value: written out directly as a 3-byte sequence. */
+	    *p++ = (char) ((*w >> 12) | 0xE0);
+	    *p++ = (char) (((*w >> 6) & 0x3F) | 0x80);
+	    *p++ = (char) ((*w & 0x3F) | 0x80);
+	} else {
+	    p += Tcl_UniCharToUtf(*w, p);
+	}
+    }
+    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
+
+    return string;
+}
+#endif
+
+/*
+ *---------------------------------------------------------------------------
+ *
  * Tcl_UtfToUniChar --
  *
  *	Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
@@ -439,6 +496,74 @@ Tcl_UtfToUniCharDString(
 /*
  *---------------------------------------------------------------------------
  *
+ * TclUtfToUnicodeDString --
+ *
+ *	Convert the UTF-8 string to Unicode.
+ *
+ * Results:
+ *	The return value is a pointer to the Unicode representation of the
+ *	UTF-8 string. Storage for the return value is appended to the end of
+ *	dsPtr. The Unicode string is terminated with a Unicode NULL character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+#if TCL_UTF_MAX <= 4
+unsigned *
+TclUtfToUnicodeDString(
+    const char *src,		/* UTF-8 string to convert to Unicode. */
+    int length,			/* Length of UTF-8 string in bytes, or -1 for
+				 * strlen(). */
+    Tcl_DString *dsPtr)		/* Unicode representation of string is
+				 * appended to this previously initialized
+				 * DString. */
+{
+    Tcl_UniChar ch = 0;
+    unsigned *w, *wString;
+    const char *p, *end;
+    int oldLength, len;
+
+    if (length < 0) {
+	length = (int) strlen(src);
+    }
+
+    /*
+     * At most one code point per input byte, plus the terminator. Only the
+     * new data (not the existing content) is scaled by the element size.
+     */
+
+    oldLength = Tcl_DStringLength(dsPtr);
+    Tcl_DStringSetLength(dsPtr,
+	    oldLength + (int) ((length + 1) * sizeof(unsigned)));
+    wString = (unsigned *) (Tcl_DStringValue(dsPtr) + oldLength);
+
+    w = wString;
+    end = src + length;
+    for (p = src; p < end; ) {
+	len = TclUtfToUniChar(p, &ch);
+	if (!len) {
+	    /* Assumes 0 means ch is a high surrogate; fetch the low half. */
+	    int high = ch;
+	    len = TclUtfToUniChar(p, &ch);
+	    *w++ = (((high & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	} else {
+	    *w++ = ch;
+	}
+	p += len;
+    }
+    *w = '\0';
+    Tcl_DStringSetLength(dsPtr,
+	    (oldLength + ((char *) w - (char *) wString)));
+
+    return wString;
+}
+#endif
+
+/*
+ *---------------------------------------------------------------------------
+ *
  * Tcl_UtfCharComplete --
  *
  *	Determine if the UTF-8 string of the given length is long enough to be
-- 
cgit v0.12


From 80a7abf7e553cc0c0ea01f10df7790996460b133 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 27 Sep 2019 13:02:39 +0000
Subject: Adapt test-case to full-utf correct behaviour

---
 tests/string.test | 14 +++++++-------
 tests/utf.test    | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/string.test b/tests/string.test
index c54b5ba..299c765 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -31,7 +31,7 @@ proc makeShared {s} {uplevel 1 [list lappend copy $s]; return $s}
 testConstraint testobj [expr {[info commands testobj] ne {}}]
 testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
 testConstraint testevalex [expr {[info commands testevalex] ne {}}]
-testConstraint tip389 [expr {[string length \U010000] == 2}]
+testConstraint fullutf [expr {[string length \U010000] == 1}]
 
 # Used for constraining memory leak tests
 testConstraint memory [llength [info commands memory]]
@@ -505,9 +505,9 @@ test string-5.19.$noComp {string index, bytearray object out of bounds} {
 test string-5.20.$noComp {string index, bytearray object out of bounds} {
     run {string index [binary format I* {0x50515253 0x52}] 20}
 } {}
-test string-5.21.$noComp {string index, surrogates, bug [11ae2be95dac9417]} tip389 {
+test string-5.21.$noComp {string index, surrogates, bug [11ae2be95dac9417]} fullutf {
     run {list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3]}
-} [list \U100000 {} b]
+} [list \U100000 b {}]
 
 
 proc largest_int {} {
@@ -1502,9 +1502,9 @@ test string-12.22.$noComp {string range, shimmering binary/index} {
     binary scan $s a* x
     run {string range $s $s end}
 } 000000001
-test string-12.23.$noComp {string range, surrogates, bug [11ae2be95dac9417]} tip389 {
+test string-12.23.$noComp {string range, surrogates, bug [11ae2be95dac9417]} fullutf {
     run {list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3]}
-} [list \U100000 {} b]
+} [list \U100000 b {}]
 
 test string-13.1.$noComp {string repeat} {
     list [catch {run {string repeat}} msg] $msg
@@ -1743,10 +1743,10 @@ test string-17.7.$noComp {string totitle, unicode} {
 test string-17.8.$noComp {string totitle, compiled} {
     lindex [run {string totitle [list aa bb [list cc]]}] 0
 } Aa
-test string-17.9.$noComp {string totitle, surrogates, bug [11ae2be95dac9417]} tip389 {
+test string-17.9.$noComp {string totitle, surrogates, bug [11ae2be95dac9417]} fullutf {
     run {list [string totitle a\U118c0c 1 1] [string totitle a\U118c0c 2 2] \
 	[string totitle a\U118c0c 3 3]}
-} [list a\U118a0c a\U118c0C a\U118c0C]
+} [list a\U118a0c a\U118c0C a\U118c0c]
 
 test string-18.1.$noComp {string trim} {
     list [catch {run {string trim}} msg] $msg
diff --git a/tests/utf.test b/tests/utf.test
index 979c4a6..45698e4 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -21,7 +21,7 @@ testConstraint testbytestring [llength [info commands testbytestring]]
 catch {unset x}
 
 # Some tests require support for 4-byte UTF-8 sequences
-testConstraint tip389 [expr {[string length \U010000] == 2}]
+testConstraint fullutf [expr {[string length \U010000] == 1}]
 
 test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring {
     expr {"\x01" eq [testbytestring "\x01"]}
@@ -78,12 +78,12 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin
 test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring {
     string length [testbytestring "\xE4\xb9\x8e"]
 } {1}
-test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {tip389 testbytestring} -body {
+test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body {
     string length [testbytestring "\xF0\x90\x80\x80"]
-} -result {2}
-test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {tip389 testbytestring} -body {
+} -result {1}
+test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body {
     string length [testbytestring "\xF4\x8F\xBF\xBF"]
-} -result {2}
+} -result {1}
 test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring {
     string length [testbytestring "\xF0\x8F\xBF\xBF"]
 } {4}
-- 
cgit v0.12