From b488f3edf6ee202281aca13745c0d4212310f654 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 3 Mar 2022 13:05:38 +0000 Subject: TIP #619 implementation. tests not working yet --- generic/tcl.h | 7 +++++++ generic/tclCmdMZ.c | 2 ++ generic/tclDecls.h | 8 ++++++++ generic/tclEncoding.c | 21 ++++++++++++--------- generic/tclParse.c | 7 ++++--- generic/tclUtf.c | 18 +++++++++++++++--- tests/utf.test | 6 +++--- 7 files changed, 51 insertions(+), 18 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 6b69929..8778203 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -835,6 +835,13 @@ typedef struct Tcl_DString { #define TCL_INDEX_NULL_OK 4 /* + * Flags that may be passed to Tcl_UniCharToUtf. + * TCL_COMBINE Combine surrogates + */ + +#define TCL_COMBINE 0x200000 + +/* *---------------------------------------------------------------------------- * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv. * WARNING: these bit choices must not conflict with the bit choices for diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 85174ec..b50eacb 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1432,9 +1432,11 @@ StringIndexCmd( char buf[4] = ""; end = Tcl_UniCharToUtf(ch, buf); +#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (end < 3)) { end += Tcl_UniCharToUtf(-1, buf + end); } +#endif Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end)); } } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 9205401..d073edd 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -3919,6 +3919,14 @@ extern const TclStubs *tclStubsPtr; # define Tcl_UtfToUniChar Tcl_UtfToChar16 # undef Tcl_UniCharLen # define Tcl_UniCharLen Tcl_Char16Len +# undef Tcl_UniCharToUtf +# if defined(USE_TCL_STUBS) +# define Tcl_UniCharToUtf(c, p) \ + (tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p))) +# else +# define Tcl_UniCharToUtf(c, p) \ + ((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p))) +# endif #endif #if defined(USE_TCL_STUBS) # define Tcl_WCharToUtfDString (sizeof(wchar_t) != sizeof(short) \ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 3a6385f..765f98b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2228,7 +2228,6 @@ UtfToUtfProc( } dst += Tcl_UniCharToUtf(ch, dst); } else { - int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) @@ -2246,13 +2245,20 @@ UtfToUtfProc( *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); ch = (ch & 0x0CFF) | 0xDC00; } - goto cesu8; +#if TCL_UTF_MAX < 4 + cesu8: +#endif + *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); + continue; +#if TCL_UTF_MAX < 4 } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - low = ch; + int low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { @@ -2261,15 +2267,12 @@ UtfToUtfProc( src = saveSrc; break; } - cesu8: - *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((ch | 0x80) & 0xBF); - continue; + goto cesu8; } src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; +#endif } else if (!Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; @@ -2578,7 +2581,7 @@ Utf16ToUtfProc( if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { - dst += Tcl_UniCharToUtf(ch, dst); + dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); } src += sizeof(unsigned short); } diff --git a/generic/tclParse.c b/generic/tclParse.c index 614401f..fdd1478 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -869,6 +869,7 @@ TclParseBackslash( * No hexdigits -> This is just "u". */ result = 'u'; +#if TCL_UTF_MAX < 4 } else if (((result & 0xFC00) == 0xD800) && (count == 6) && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) { /* If high surrogate is immediately followed by a low surrogate @@ -879,6 +880,7 @@ TclParseBackslash( result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000; count += count2 + 2; } +#endif } break; case 'U': @@ -888,9 +890,6 @@ TclParseBackslash( * No hexdigits -> This is just "U". */ result = 'U'; - } else if ((result | 0x7FF) == 0xDFFF) { - /* Upper or lower surrogate, not allowed in this syntax. */ - result = 0xFFFD; } break; case '\n': @@ -954,10 +953,12 @@ TclParseBackslash( *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); +#if TCL_UTF_MAX < 4 if ((result >= 0xD800) && (count < 3)) { /* Special case for handling high surrogates. */ count += Tcl_UniCharToUtf(-1, dst + count); } +#endif return count; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e353b7f..a04e41c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -208,15 +208,23 @@ Invalid( *--------------------------------------------------------------------------- */ +#undef Tcl_UniCharToUtf int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the - * buffer. */ + * buffer. Can be or'ed with flag TCL_COMBINE */ char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character * (at most 4 bytes). */ { +#if TCL_UTF_MAX > 3 + int flags = ch; +#endif + + if (ch >= TCL_COMBINE) { + ch &= (TCL_COMBINE - 1); + } if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; @@ -228,7 +236,11 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { - if ((ch & 0xF800) == 0xD800) { + if ( +#if TCL_UTF_MAX > 3 + (flags & TCL_COMBINE) && +#endif + ((ch & 0xF800) == 0xD800)) { if (ch & 0x0400) { /* Low surrogate */ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { @@ -377,7 +389,7 @@ Tcl_Char16ToUtfDString( /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } - len = Tcl_UniCharToUtf(*w, p); + len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p); p += len; if ((*w >= 0xD800) && (len < 3)) { len = 0; /* Indication that high surrogate was found */ diff --git a/tests/utf.test b/tests/utf.test index 09599b6..f094a23 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -78,11 +78,11 @@ test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} { expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]} } 1 -test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} { +test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc} { expr {"\UD842" eq "\uD842"} } 1 -test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { - expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]} +test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {testbytestring} { + expr {"\UD842" eq [testbytestring \xED\xA1\x82]} } 1 test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} { set lo \uDE02 -- cgit v0.12