From 23d9ca0ec4772f703cd24c476d5fa485fd91e828 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 19 Feb 2023 11:41:44 +0000 Subject: Proposed fix for [5607d6482c]: strict ucs-2 never implemented (TIP #346/#656) --- generic/tclEncoding.c | 36 ++++++++++++++++++++++++------------ tests/encoding.test | 8 +++++++- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1d3a3eb..d2b0efc 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -573,13 +573,13 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(TCL_ENCODING_LE|TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; - type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2"; - type.clientData = INT2PTR(isLe.c|TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); type.toUtfProc = Utf32ToUtfProc; @@ -601,13 +601,13 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "utf-16le"; - type.clientData = INT2PTR(TCL_ENCODING_LE); + type.clientData = INT2PTR(TCL_ENCODING_LE|ENCODING_UTF); Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; - type.clientData = INT2PTR(0); + type.clientData = INT2PTR(ENCODING_UTF); Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(isLe.c|ENCODING_UTF); Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED @@ -2984,10 +2984,7 @@ UtfToUcs2Proc( * output buffer. */ { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; - int result, numChars; -#if TCL_UTF_MAX < 4 - int len; -#endif + int result, numChars, len; Tcl_UniChar ch = 0; flags |= PTR2INT(clientData); @@ -3017,17 +3014,32 @@ UtfToUcs2Proc( break; } #if TCL_UTF_MAX < 4 - src += (len = TclUtfToUniChar(src, &ch)); + len = TclUtfToUniChar(src, &ch); if ((ch >= 0xD800) && (len < 3)) { + if (STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + src += len; src += TclUtfToUniChar(src, &ch); ch = 0xFFFD; } #else - src += TclUtfToUniChar(src, &ch); + len = TclUtfToUniChar(src, &ch); if (ch > 0xFFFF) { + if (STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } ch = 0xFFFD; } #endif + if (STOPONERROR && ((ch & ~0x7FF) == 0xD800)) { + result = TCL_CONVERT_SYNTAX; + break; + } + + src += len; /* * Need to handle this in a way that won't cause misalignment by diff --git a/tests/encoding.test b/tests/encoding.test index 03f0273..83e75be 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -561,7 +561,7 @@ test encoding-16.9 { test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" } -result "\xD8\xD8\xDC\xDC" -test encoding-17.2 {UtfToUcs2Proc} -body { +test encoding-17.2 {UtfToUcs2Proc, invalid testcase, see [5607d6482c]} -constraints deprecated -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" test encoding-17.3 {UtfToUtf16Proc} -body { @@ -853,6 +853,12 @@ test encoding-24.42 {Parse invalid utf-8, fallback to cp1252 [885c86a9a0]} -body test encoding-24.43 {Parse invalid utf-8, fallback to cp1252 [885c86a9a0]} -body { encoding convertfrom -nocomplain utf-8 \x80 } -result \u20AC +test encoding-24.44 {Try to generate invalid ucs-2 with -strict} -body { + encoding convertto -strict ucs-2 \uD800 +} -returnCodes 1 -result {unexpected character at index 0: 'U+00D800'} +test encoding-24.45 {Try to generate invalid ucs-2 with -strict} -body { + encoding convertto -strict ucs-2 \U10000 +} -returnCodes 1 -result {unexpected character at index 0: 'U+010000'} file delete [file join [temporaryDirectory] iso2022.txt] -- cgit v0.12