From 3af16acbcb63ea2935d71b905371252560dc4659 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 29 Nov 2017 08:59:49 +0000 Subject: Treat invalid UTF-8 characters in the range 0x80-0x9F as cp1252: See [https://en.wikipedia.org/wiki/UTF-8]. To be added to TIP #389 --- doc/Utf.3 | 3 +++ generic/tclInt.h | 2 +- generic/tclUtf.c | 17 +++++++++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 638f349..de9545d 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -140,6 +140,9 @@ number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is +a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the +cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR +and returns 1. If the input is otherwise not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and 0x00ff and return 1. diff --git a/generic/tclInt.h b/generic/tclInt.h index ef88bf5..d77889e 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -4446,7 +4446,7 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, */ #define TclUtfToUniChar(str, chPtr) \ - ((((unsigned char) *(str)) < 0xC0) ? \ + ((((unsigned char) *(str)) < 0x80) ? \ ((*(chPtr) = (unsigned char) *(str)), 1) \ : Tcl_UtfToUniChar(str, chPtr)) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4ed201f..aed332f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -272,6 +272,13 @@ Tcl_UniCharToUtfDString( *--------------------------------------------------------------------------- */ +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; + int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ @@ -288,11 +295,17 @@ Tcl_UtfToUniChar( if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid * characters representing themselves. */ - *chPtr = (Tcl_UniChar) byte; + if ((unsigned)(byte-0x80) < (unsigned) 0x20) { + *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; + } else { + *chPtr = (Tcl_UniChar) byte; + } return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { -- cgit v0.12