From b488f3edf6ee202281aca13745c0d4212310f654 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 3 Mar 2022 13:05:38 +0000 Subject: TIP #619 implementation. tests not working yet --- generic/tcl.h | 7 +++++++ generic/tclCmdMZ.c | 2 ++ generic/tclDecls.h | 8 ++++++++ generic/tclEncoding.c | 21 ++++++++++++--------- generic/tclParse.c | 7 ++++--- generic/tclUtf.c | 18 +++++++++++++++--- tests/utf.test | 6 +++--- 7 files changed, 51 insertions(+), 18 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 6b69929..8778203 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -835,6 +835,13 @@ typedef struct Tcl_DString { #define TCL_INDEX_NULL_OK 4 /* + * Flags that may be passed to Tcl_UniCharToUtf. + * TCL_COMBINE Combine surrogates + */ + +#define TCL_COMBINE 0x200000 + +/* *---------------------------------------------------------------------------- * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv. * WARNING: these bit choices must not conflict with the bit choices for diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 85174ec..b50eacb 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1432,9 +1432,11 @@ StringIndexCmd( char buf[4] = ""; end = Tcl_UniCharToUtf(ch, buf); +#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (end < 3)) { end += Tcl_UniCharToUtf(-1, buf + end); } +#endif Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end)); } } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 9205401..d073edd 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -3919,6 +3919,14 @@ extern const TclStubs *tclStubsPtr; # define Tcl_UtfToUniChar Tcl_UtfToChar16 # undef Tcl_UniCharLen # define Tcl_UniCharLen Tcl_Char16Len +# undef Tcl_UniCharToUtf +# if defined(USE_TCL_STUBS) +# define Tcl_UniCharToUtf(c, p) \ + (tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p))) +# else +# define Tcl_UniCharToUtf(c, p) \ + ((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p))) +# endif #endif #if defined(USE_TCL_STUBS) # define Tcl_WCharToUtfDString (sizeof(wchar_t) != sizeof(short) \ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 3a6385f..765f98b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2228,7 +2228,6 @@ UtfToUtfProc( } dst += Tcl_UniCharToUtf(ch, dst); } else { - int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) @@ -2246,13 +2245,20 @@ UtfToUtfProc( *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); ch = (ch & 0x0CFF) | 0xDC00; } - goto cesu8; +#if TCL_UTF_MAX < 4 + cesu8: +#endif + *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); + continue; +#if TCL_UTF_MAX < 4 } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - low = ch; + int low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { @@ -2261,15 +2267,12 @@ UtfToUtfProc( src = saveSrc; break; } - cesu8: - *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((ch | 0x80) & 0xBF); - continue; + goto cesu8; } src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; +#endif } else if (!Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; @@ -2578,7 +2581,7 @@ Utf16ToUtfProc( if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { - dst += Tcl_UniCharToUtf(ch, dst); + dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); } src += sizeof(unsigned short); } diff --git a/generic/tclParse.c b/generic/tclParse.c index 614401f..fdd1478 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -869,6 +869,7 @@ TclParseBackslash( * No hexdigits -> This is just "u". */ result = 'u'; +#if TCL_UTF_MAX < 4 } else if (((result & 0xFC00) == 0xD800) && (count == 6) && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) { /* If high surrogate is immediately followed by a low surrogate @@ -879,6 +880,7 @@ TclParseBackslash( result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000; count += count2 + 2; } +#endif } break; case 'U': @@ -888,9 +890,6 @@ TclParseBackslash( * No hexdigits -> This is just "U". */ result = 'U'; - } else if ((result | 0x7FF) == 0xDFFF) { - /* Upper or lower surrogate, not allowed in this syntax. */ - result = 0xFFFD; } break; case '\n': @@ -954,10 +953,12 @@ TclParseBackslash( *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); +#if TCL_UTF_MAX < 4 if ((result >= 0xD800) && (count < 3)) { /* Special case for handling high surrogates. */ count += Tcl_UniCharToUtf(-1, dst + count); } +#endif return count; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e353b7f..a04e41c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -208,15 +208,23 @@ Invalid( *--------------------------------------------------------------------------- */ +#undef Tcl_UniCharToUtf int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the - * buffer. */ + * buffer. Can be or'ed with flag TCL_COMBINE */ char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character * (at most 4 bytes). */ { +#if TCL_UTF_MAX > 3 + int flags = ch; +#endif + + if (ch >= TCL_COMBINE) { + ch &= (TCL_COMBINE - 1); + } if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; @@ -228,7 +236,11 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { - if ((ch & 0xF800) == 0xD800) { + if ( +#if TCL_UTF_MAX > 3 + (flags & TCL_COMBINE) && +#endif + ((ch & 0xF800) == 0xD800)) { if (ch & 0x0400) { /* Low surrogate */ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { @@ -377,7 +389,7 @@ Tcl_Char16ToUtfDString( /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } - len = Tcl_UniCharToUtf(*w, p); + len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p); p += len; if ((*w >= 0xD800) && (len < 3)) { len = 0; /* Indication that high surrogate was found */ diff --git a/tests/utf.test b/tests/utf.test index 09599b6..f094a23 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -78,11 +78,11 @@ test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} { expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]} } 1 -test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} { +test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc} { expr {"\UD842" eq "\uD842"} } 1 -test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { - expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]} +test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {testbytestring} { + expr {"\UD842" eq [testbytestring \xED\xA1\x82]} } 1 test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} { set lo \uDE02 -- cgit v0.12 From e9e4041670725dbaa04756d51351bb717a17fa46 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 31 Mar 2022 07:20:00 +0000 Subject: Minor change in utf.test --- tests/utf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utf.test b/tests/utf.test index f094a23..389bbce 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -81,7 +81,7 @@ test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4b test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc} { expr {"\UD842" eq "\uD842"} } 1 -test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {testbytestring} { +test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { expr {"\UD842" eq [testbytestring \xED\xA1\x82]} } 1 test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} { -- cgit v0.12 From e9f9cac4be915b90b69b1ee6d0b72c2f57ce590d Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 31 Mar 2022 09:44:20 +0000 Subject: Oops --- generic/tclStringObj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index f1dc0f5..47b532d 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -70,7 +70,7 @@ static void SetUnicodeObj(Tcl_Obj *objPtr, static size_t UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); -#if TCL+UTF_MAX > 3 +#if TCL_UTF_MAX > 3 #define ISCONTINUATION(bytes) (\ ((bytes)[0] & 0xC0) == 0x80) #else -- cgit v0.12 From e0f543167d530ab16a23c67931db48f52c6a8ef3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 31 Mar 2022 10:08:13 +0000 Subject: Final tweaks in testcases --- tests/encoding.test | 10 +++++----- tests/utf.test | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/encoding.test b/tests/encoding.test index dfe844f..99ea70c 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -340,13 +340,13 @@ test encoding-15.3 {UtfToUtfProc null character input} teststringbytes { test encoding-15.4 {UtfToUtfProc emoji character input} -body { set x \xED\xA0\xBD\xED\xB8\x82 set y [encoding convertfrom -nocomplain utf-8 \xED\xA0\xBD\xED\xB8\x82] - list [string length $y] $y -} -result "2 \uD83D\uDE02" -test encoding-15.5 {UtfToUtfProc emoji character input} ucs4 { + list [string length $x] $y +} -result "6 \uD83D\uDE02" +test encoding-15.5 {UtfToUtfProc emoji character input} { set x \xF0\x9F\x98\x82 set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] - list [string length $y] $y -} "1 😂" + list [string length $x] $y +} "4 😂" test encoding-15.6 {UtfToUtfProc emoji character output} ucs4 { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D] diff --git a/tests/utf.test b/tests/utf.test index 389bbce..4a1c063 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -78,7 +78,7 @@ test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} { expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]} } 1 -test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc} { +test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} Uesc { expr {"\UD842" eq "\uD842"} } 1 test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { -- cgit v0.12 From 05de893f6ff1e5b322d9579f183a83ad49be48df Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 1 Apr 2022 09:48:44 +0000 Subject: Fix some more testcases (involving string reverse/trim) --- tests/string.test | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/string.test b/tests/string.test index 70b0e0f..8d99e88 100644 --- a/tests/string.test +++ b/tests/string.test @@ -34,6 +34,8 @@ testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint fullutf [expr {[string length \U010000] == 1}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint ucs4 [expr {[testConstraint fullutf] + && [string length [format %c 0x10000]] == 1}] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -1941,13 +1943,13 @@ test string-21.22.$noComp {string trimright, unicode} { run {string trimright "\uF602Hello world!\uF602" \uD83D\uDE02} } "\uF602Hello world!\uF602" test string-21.23.$noComp {string trim, unicode} { - run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} + run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-21.24.$noComp {string trimleft, unicode} { run {string trimleft "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-21.25.$noComp {string trimright, unicode} { - run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} + run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-22.1.$noComp {string wordstart} -body { @@ -2111,24 +2113,24 @@ test string-24.15.$noComp {string reverse command - pure bytearray} { binary scan [run {tcl::string::reverse [binary format H* 010203]}] H* x set x } 030201 -test string-24.16.$noComp {string reverse command - surrogates} { +test string-24.16.$noComp {string reverse command - surrogates} ucs4 { run {string reverse \u0444bulb\uD83D\uDE02} -} \uD83D\uDE02blub\u0444 -test string-24.17.$noComp {string reverse command - surrogates} { +} \uDE02\uD83Dblub\u0444 +test string-24.17.$noComp {string reverse command - surrogates} ucs4 { run {string reverse \uD83D\uDE02hello\uD83D\uDE02} -} \uD83D\uDE02olleh\uD83D\uDE02 -test string-24.18.$noComp {string reverse command - surrogates} { +} \uDE02\uD83Dolleh\uDE02\uD83D +test string-24.18.$noComp {string reverse command - surrogates} ucs4 { set s \u0444bulb\uD83D\uDE02 # shim shimmery ... string index $s 0 run {string reverse $s} -} \uD83D\uDE02blub\u0444 -test string-24.19.$noComp {string reverse command - surrogates} { +} \uDE02\uD83Dblub\u0444 +test string-24.19.$noComp {string reverse command - surrogates} ucs4 { set s \uD83D\uDE02hello\uD83D\uDE02 # shim shimmery ... string index $s 0 run {string reverse $s} -} \uD83D\uDE02olleh\uD83D\uDE02 +} \uDE02\uD83Dolleh\uDE02\uD83D test string-25.1.$noComp {string is list} { run {string is list {a b c}} -- cgit v0.12 From 4d0bc87fb41014bd1ca72ebc565ac4d6b8230bd4 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 4 Jul 2022 08:31:08 +0000 Subject: Add TCL_COMBINE, just a NOP for now (will get a meaning in 9.0 --- generic/tcl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/generic/tcl.h b/generic/tcl.h index d53c0f2..d99e9fa 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -993,6 +993,13 @@ typedef struct Tcl_DString { #define TCL_INDEX_TEMP_TABLE 64 /* + * Flags that may be passed to Tcl_UniCharToUtf. + * TCL_COMBINE Combine surrogates (default in Tcl 8.x) + */ + +#define TCL_COMBINE 0 + +/* *---------------------------------------------------------------------------- * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv. * WARNING: these bit choices must not conflict with the bit choices for -- cgit v0.12