Treat invalid UTF-8 bytes in the range 0x80-0x9F as cp1252 characters. See [https://en.wikipedia.org/wiki/UTF-8]. To be added to TIP #389.

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-11-29 08:59:49 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-11-29 08:59:49 (GMT)
commit: 3af16acbcb63ea2935d71b905371252560dc4659 (patch)
tree: 0a9f024f95deaffc4eeb967fec06b82d0229a34e
parent: eef0b72b5e12dededc794db29219be26574f6daf (diff)
download: tcl-3af16acbcb63ea2935d71b905371252560dc4659.zip
tcl-3af16acbcb63ea2935d71b905371252560dc4659.tar.gz
tcl-3af16acbcb63ea2935d71b905371252560dc4659.tar.bz2
3 files changed, 19 insertions, 3 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
index 638f349..de9545d 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -140,6 +140,9 @@ number of bytes read from \fIsrc\fR.  The caller must ensure that the
 source buffer is long enough such that this routine does not run off the
 end and dereference non-existent or random memory; if the source buffer
 is known to be null-terminated, this will not happen.  If the input is
+a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the
+cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR
+and returns 1.  If the input is otherwise
 not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
 byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
 0x00ff and return 1.
diff --git a/generic/tclInt.h b/generic/tclInt.h
index ef88bf5..d77889e 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -4446,7 +4446,7 @@ MODULE_SCOPE void	TclDbInitNewObj(Tcl_Obj *objPtr, const char *file,
  */
 
 #define TclUtfToUniChar(str, chPtr) \
-	((((unsigned char) *(str)) < 0xC0) ?		\
+	((((unsigned char) *(str)) < 0x80) ?		\
 	    ((*(chPtr) = (unsigned char) *(str)), 1)	\
 	    : Tcl_UtfToUniChar(str, chPtr))
 
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 4ed201f..aed332f 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -272,6 +272,13 @@ Tcl_UniCharToUtfDString(
  *---------------------------------------------------------------------------
  */
 
+static const unsigned short cp1252[32] = {
+  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
+    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
+};
+
 int
 Tcl_UtfToUniChar(
     register const char *src,	/* The UTF-8 string. */
@@ -288,11 +295,17 @@ Tcl_UtfToUniChar(
     if (byte < 0xC0) {
 	/*
 	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
-	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
+	 * Treats naked trail bytes 0x80 to 0x9F as valid characters from
+	 * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
+	 * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
 	 * characters representing themselves.
 	 */
 
-	*chPtr = (Tcl_UniChar) byte;
+	if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
+	    *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
+	} else {
+	    *chPtr = (Tcl_UniChar) byte;
+	}
 	return 1;
     } else if (byte < 0xE0) {
 	if ((src[1] & 0xC0) == 0x80) {
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2017-11-29 08:59:49 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2017-11-29 08:59:49 (GMT)
commit	3af16acbcb63ea2935d71b905371252560dc4659 (patch)
tree	0a9f024f95deaffc4eeb967fec06b82d0229a34e
parent	eef0b72b5e12dededc794db29219be26574f6daf (diff)
download	tcl-3af16acbcb63ea2935d71b905371252560dc4659.zip tcl-3af16acbcb63ea2935d71b905371252560dc4659.tar.gz tcl-3af16acbcb63ea2935d71b905371252560dc4659.tar.bz2