From bbcdfd9e87e6ffd0c35950b0d6777239785d892a Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Tue, 6 Feb 2024 13:01:11 +0000
Subject: Proposed fix for [80d4c1c7e5]: Isolated surrogates are not replaced
 when encoding to / from utf-8

---
 generic/tclEncoding.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index d780299..63ebdf4 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2485,7 +2485,6 @@ UtfToUtfProc(
     flags |= PTR2INT(clientData);
     dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
 
-
     profile = ENCODING_PROFILE_GET(flags);
     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 
@@ -2596,6 +2595,8 @@ UtfToUtfProc(
 		result = (flags & ENCODING_INPUT) ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
 		src = saveSrc;
 		break;
+	    } else if (PROFILE_REPLACE(profile) && SURROGATE(ch)) {
+		ch = UNICODE_REPLACE_CHAR;
 	    }
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	}
-- 
cgit v0.12


From ed502a9c1fb2a4504b8de6a475fcf19e92bd6607 Mon Sep 17 00:00:00 2001
From: apnadkarni <apnmbx-wits@yahoo.com>
Date: Wed, 7 Feb 2024 03:06:05 +0000
Subject: Add test cases for [80d4c1c7e5]

---
 tests/encoding.test       | 6 ++++--
 tests/encodingVectors.tcl | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/encoding.test b/tests/encoding.test
index 8bc096c..c27fcd3 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -881,8 +881,8 @@ test encoding-24.37 {Parse invalid utf-8 with -profile tcl8} -body {
 test encoding-24.38.1 {Try to generate invalid utf-8} -body {
     encoding convertto -profile tcl8 utf-8 \uD800
 } -result \xED\xA0\x80
-test encoding-24.38.2 {Try to generate invalid utf-8} -body {
-    encoding convertto -profile strict utf-8 \uD800
+test encoding-24.38.2 {Try to generate invalid utf-8 - default profile} -body {
+    encoding convertto utf-8 \uD800
 } -returnCodes 1 -result {unexpected character at index 0: 'U+00D800'}
 test encoding-24.39 {Try to generate invalid utf-8 with -profile strict} -body {
     encoding convertto -profile strict utf-8 \uD800
@@ -1176,6 +1176,8 @@ test encoding-bug-66ffafd309-2-replace {Bug [66ffafd309] - invalid DBCS} -body {
     encoding convertfrom -profile replace jis0208 \x78\x79
 } -result \uFFFD\uFFFD
 
+
+
 # cleanup
 namespace delete ::tcl::test::encoding
 ::tcltest::cleanupTests
diff --git a/tests/encodingVectors.tcl b/tests/encodingVectors.tcl
index 8bd6b87..6583473 100644
--- a/tests/encodingVectors.tcl
+++ b/tests/encodingVectors.tcl
@@ -641,10 +641,12 @@ lappend encUnencodableStrings {*}{
     iso8859-1 \u0141 tcl8    3f -1 {} unencodable
     iso8859-1 \u0141 strict  {}  0 {} unencodable
 
-    utf-8 \uD800 tcl8    eda080 -1 {} High-surrogate
-    utf-8 \uD800 strict  {}      0 {} High-surrogate
+    utf-8 \uD800 tcl8    eda080 -1 {} Low-surrogate
+    utf-8 \uD800 replace efbfbd -1 {} Low-surrogate
+    utf-8 \uD800 strict  {}      0 {} Low-surrogate
     utf-8 \uDC00 tcl8    edb080 -1 {} High-surrogate
     utf-8 \uDC00 strict  {}      0 {} High-surrogate
+    utf-8 \uDC00 replace efbfbd -1 {} High-surrogate
 }
 
 
-- 
cgit v0.12


From f531ea653b3043bcd9a542083ee9c6e07e286cfc Mon Sep 17 00:00:00 2001
From: pooryorick <com.digitalsmarties@pooryorick.com>
Date: Wed, 7 Feb 2024 14:47:07 +0000
Subject: Implementation of TIP 652.

---
 doc/UniCharIsAlpha.3    |  8 +-------
 generic/tcl.decls       |  8 +++++---
 generic/tclCmdMZ.c      |  7 ++-----
 generic/tclCompCmdsSZ.c |  8 ++------
 generic/tclCompile.h    |  1 -
 generic/tclStubInit.c   |  2 +-
 generic/tclUtf.c        | 30 ------------------------------
 tests/string.test       | 25 ++-----------------------
 8 files changed, 13 insertions(+), 76 deletions(-)

diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3
index 4ae4612..2b8c0d7 100644
--- a/doc/UniCharIsAlpha.3
+++ b/doc/UniCharIsAlpha.3
@@ -8,7 +8,7 @@
 .so man.macros
 .BS
 .SH NAME
-Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsUnicode, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters
+Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters
 .SH SYNOPSIS
 .nf
 \fB#include <tcl.h>\fR
@@ -44,9 +44,6 @@ int
 \fBTcl_UniCharIsUpper\fR(\fIch\fR)
 .sp
 int
-\fBTcl_UniCharIsUnicode\fR(\fIch\fR)
-.sp
-int
 \fBTcl_UniCharIsWordChar\fR(\fIch\fR)
 .fi
 .SH ARGUMENTS
@@ -91,9 +88,6 @@ character.
 \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode
 character.
 .PP
-\fBTcl_UniCharIsUnicode\fR tests if the character is a Unicode character,
-not being a surrogate or noncharacter.
-.PP
 \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or
 a connector punctuation mark.
 
diff --git a/generic/tcl.decls b/generic/tcl.decls
index 8e047d0..b8e1e1f 100644
--- a/generic/tcl.decls
+++ b/generic/tcl.decls
@@ -2499,9 +2499,11 @@ declare 655 {
 declare 656 {
     const char *Tcl_UtfPrev(const char *src, const char *start)
 }
-declare 657 {
-    int Tcl_UniCharIsUnicode(int ch)
-}
+# Removed by TIP #652
+#
+#declare 657 {
+#    int Tcl_UniCharIsUnicode(int ch)
+#}
 
 # TIP 656
 declare 658 {
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 38e04cb..2660ff1 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1534,7 +1534,7 @@ StringIsCmd(
 	"boolean",	"dict",		"digit",	"double",
 	"entier",	"false",	"graph",	"integer",
 	"list",		"lower",	"print",	"punct",
-	"space",	"true",		"upper",	"unicode",
+	"space",	"true",		"upper",
 	"wideinteger", "wordchar",	"xdigit",	NULL
     };
     enum isClassesEnum {
@@ -1542,7 +1542,7 @@ StringIsCmd(
 	STR_IS_BOOL,	STR_IS_DICT,	STR_IS_DIGIT,	STR_IS_DOUBLE,
 	STR_IS_ENTIER,	STR_IS_FALSE,	STR_IS_GRAPH,	STR_IS_INT,
 	STR_IS_LIST,	STR_IS_LOWER,	STR_IS_PRINT,	STR_IS_PUNCT,
-	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,	STR_IS_UNICODE,
+	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,
 	STR_IS_WIDE,	STR_IS_WORD,	STR_IS_XDIGIT
     } index;
     static const char *const isOptions[] = {
@@ -1871,9 +1871,6 @@ StringIsCmd(
     case STR_IS_UPPER:
 	chcomp = Tcl_UniCharIsUpper;
 	break;
-    case STR_IS_UNICODE:
-	chcomp = Tcl_UniCharIsUnicode;
-	break;
     case STR_IS_WORD:
 	chcomp = Tcl_UniCharIsWordChar;
 	break;
diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c
index 0a21226..d79b7b9 100644
--- a/generic/tclCompCmdsSZ.c
+++ b/generic/tclCompCmdsSZ.c
@@ -508,7 +508,7 @@ TclCompileStringIsCmd(
 	"boolean",	"dict",		"digit",	"double",
 	"entier",	"false",	"graph",	"integer",
 	"list",		"lower",	"print",	"punct",
-	"space",	"true",		"upper",	"unicode",
+	"space",	"true",		"upper",
 	"wideinteger", "wordchar",	"xdigit",	NULL
     };
     enum isClassesEnum {
@@ -516,7 +516,7 @@ TclCompileStringIsCmd(
 	STR_IS_BOOL,	STR_IS_DICT,	STR_IS_DIGIT,	STR_IS_DOUBLE,
 	STR_IS_ENTIER,	STR_IS_FALSE,	STR_IS_GRAPH,	STR_IS_INT,
 	STR_IS_LIST,	STR_IS_LOWER,	STR_IS_PRINT,	STR_IS_PUNCT,
-	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,	STR_IS_UNICODE,
+	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,
 	STR_IS_WIDE,	STR_IS_WORD,	STR_IS_XDIGIT
     } t;
     int range, allowEmpty = 0, end;
@@ -609,9 +609,6 @@ TclCompileStringIsCmd(
     case STR_IS_UPPER:
 	strClassType = STR_CLASS_UPPER;
 	goto compileStrClass;
-    case STR_IS_UNICODE:
-	strClassType = STR_CLASS_UNICODE;
-	goto compileStrClass;
     case STR_IS_WORD:
 	strClassType = STR_CLASS_WORD;
 	goto compileStrClass;
@@ -1423,7 +1420,6 @@ StringClassDesc const tclStringClassTable[] = {
     {"upper",	Tcl_UniCharIsUpper},
     {"word",	Tcl_UniCharIsWordChar},
     {"xdigit",	UniCharIsHexDigit},
-    {"unicode",	Tcl_UniCharIsUnicode},
     {"",	NULL}
 };
 
diff --git a/generic/tclCompile.h b/generic/tclCompile.h
index 2ea2565..5bbbb8f 100644
--- a/generic/tclCompile.h
+++ b/generic/tclCompile.h
@@ -918,7 +918,6 @@ typedef enum InstStringClassType {
 				 * punctuation) characters. */
     STR_CLASS_XDIGIT,		/* Characters that can be used as digits in
 				 * hexadecimal numbers ([0-9A-Fa-f]). */
-    STR_CLASS_UNICODE		/* Unicode characters. */
 } InstStringClassType;
 
 typedef struct StringClassDesc {
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index 34e8c27..9072796 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -1452,7 +1452,7 @@ const TclStubs tclStubs = {
     Tcl_UtfCharComplete, /* 654 */
     Tcl_UtfNext, /* 655 */
     Tcl_UtfPrev, /* 656 */
-    Tcl_UniCharIsUnicode, /* 657 */
+    0, /* 657 */
     Tcl_ExternalToUtfDStringEx, /* 658 */
     Tcl_UtfToExternalDStringEx, /* 659 */
     Tcl_AsyncMarkFromSignal, /* 660 */
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ca4a166..9888772 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2283,36 +2283,6 @@ Tcl_UniCharIsUpper(
 /*
  *----------------------------------------------------------------------
  *
- * Tcl_UniCharIsUnicode --
- *
- *	Test if a character is a Unicode character.
- *
- * Results:
- *	Returns non-zero if character belongs to the Unicode set.
- *
- *	Excluded are:
- *	  1) All characters > U+10FFFF
- *	  2) Surrogates U+D800 - U+DFFF
- *	  3) Last 2 characters of each plane, so U+??FFFE  and U+??FFFF
- *	  4) The characters in the range U+FDD0 - U+FDEF
- *
- * Side effects:
- *	None.
- *
- *----------------------------------------------------------------------
- */
-
-int
-Tcl_UniCharIsUnicode(
-    int ch)			/* Unicode character to test. */
-{
-    return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800)
-	    && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32);
-}
-
-/*
- *----------------------------------------------------------------------
- *
  * Tcl_UniCharIsWordChar --
  *
  *	Test if a character is alphanumeric or a connector punctuation mark.
diff --git a/tests/string.test b/tests/string.test
index a232f1e..26cd8a7 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -537,10 +537,10 @@ test string-6.4.$noComp {string is, too many args} {
 } {1 {wrong # args: should be "string is class ?-strict? ?-failindex var? str"}}
 test string-6.5.$noComp {string is, class check} {
     list [catch {run {string is bogus str}} msg] $msg
-} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}}
+} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}}
 test string-6.6.$noComp {string is, ambiguous class} {
     list [catch {run {string is al str}} msg] $msg
-} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}}
+} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}}
 test string-6.7.$noComp {string is alpha, all ok} {
     run {string is alpha -strict -failindex var abc}
 } 1
@@ -973,27 +973,6 @@ test string-6.130.1.$noComp {string is entier, false on bad octal} {
 test string-6.131.$noComp {string is entier, false on bad hex} {
     list [run {string is entier -fail var 0X12345611234123456123456562345612345612345612345612345612345612345612345612345612345345XYZ}] $var
 } {0 88}
-test string-6.132.$noComp {string is unicode} {
-    run {string is unicode \U10FFFD\uD7FF\uE000\uFDCF\uFDF0}
-} 1
-test string-6.133.$noComp {string is unicode, upper surrogate} {
-    run {string is unicode \uD800}
-} 0
-test string-6.134.$noComp {string is unicode, lower surrogate} {
-    run {string is unicode \uDFFF}
-} 0
-test string-6.135.$noComp {string is unicode, noncharacter} {
-    run {string is unicode \uFFFE}
-} 0
-test string-6.136.$noComp {string is unicode, noncharacter} {
-    run {string is unicode \uFFFF}
-} 0
-test string-6.137.$noComp {string is unicode, noncharacter} {
-    run {string is unicode \uFDD0}
-} 0
-test string-6.138.$noComp {string is unicode, noncharacter} {
-    run {string is unicode \uFDEF}
-} 0
 test string-6.139.$noComp {string is integer, bug [76ad7aeba3]} {
     run {string is integer 18446744073709551615}
 } 1
-- 
cgit v0.12


From 181274a8a0e892445b3150b57fd22c904bbe0ce2 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 9 Feb 2024 14:17:42 +0000
Subject: This should be removed too

---
 doc/string.n | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/doc/string.n b/doc/string.n
index 3b9af03..f07a591 100644
--- a/doc/string.n
+++ b/doc/string.n
@@ -181,13 +181,6 @@ zero width no-break space (U+feff) (=BOM).
 .IP \fBtrue\fR 12
 Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
 true.
-.IP \fBunicode\fR 12
-Any Unicode character, except surrogates and noncharacters.
-.RS
-.PP
-\fIWarning: this option is under discussion and may be renamed or replaced
-by another solution within the Tcl 9.0 series.\fR
-.RE
 .IP \fBupper\fR 12
 Any upper case alphabet character in the Unicode character set.
 .IP \fBwideinteger\fR 12
-- 
cgit v0.12