diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-17 12:46:31 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-17 12:46:31 (GMT) |
commit | 74f570670c3ca45a4ff87a051d9ecdadb396dc56 (patch) | |
tree | d4dffcd7bdf37dcc8542df142df27ad4b5e40d69 /generic/tclEncoding.c | |
parent | 9d25ea9cd7a4e93d8870ddcb15b4677e5d908e2e (diff) | |
download | tcl-74f570670c3ca45a4ff87a051d9ecdadb396dc56.zip tcl-74f570670c3ca45a4ff87a051d9ecdadb396dc56.tar.gz tcl-74f570670c3ca45a4ff87a051d9ecdadb396dc56.tar.bz2 |
Add wtf-16 encodings to the set. With testcases
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 119 |
1 files changed, 53 insertions, 66 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2198c33..2707972 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -587,12 +587,21 @@ TclInitEncodingSubsystem(void) type.encodingName = "utf-16le"; type.clientData = INT2PTR(FLAG_LE); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16le"; + type.clientData = INT2PTR(FLAG_LE + FLAG_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16be"; + type.clientData = INT2PTR(FLAG_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16"; + type.clientData = INT2PTR(isLe.c + FLAG_WTF); + Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED type.encodingName = "unicode"; @@ -2281,11 +2290,7 @@ UtfToUtfProc( const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2309,11 +2314,8 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int encflags = PTR2INT(clientData); - int *chPtr = (int *) statePtr; + int ch; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } result = TCL_OK; srcStart = src; @@ -2350,7 +2352,6 @@ UtfToUtfProc( */ *dst++ = *src++; - *chPtr = 0; /* reset surrogate handling */ } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { /* @@ -2358,7 +2359,6 @@ UtfToUtfProc( */ *dst++ = 0; - *chPtr = 0; /* reset surrogate handling */ src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* @@ -2367,32 +2367,32 @@ UtfToUtfProc( * incomplete char its bytes are made to represent themselves. */ - *chPtr = UCHAR(*src); + ch = UCHAR(*src); src += 1; - dst += Tcl_UniCharToUtf(*chPtr, dst); + dst += Tcl_UniCharToUtf(ch, dst); } else { - src += TclUtfToUCS4(src, chPtr); - if ((*chPtr | 0x7FF) == 0xDFFF) { + src += TclUtfToUCS4(src, &ch); + if ((ch | 0x7FF) == 0xDFFF) { /* A surrogate character is detected, handle especially */ - int low = *chPtr; + int low = ch; size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { + if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) { - *chPtr = 0xFFFD; + ch = 0xFFFD; } - *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((*chPtr | 0x80) & 0xBF); + *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); continue; } src += len; - dst += Tcl_UniCharToUtf(*chPtr, dst); - *chPtr = low; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } else if ((pureNullMode == 1) && !(encflags & FLAG_WTF) - && !Tcl_UniCharIsUnicode(*chPtr)) { - *chPtr = 0xFFFD; + && !Tcl_UniCharIsUnicode(ch)) { + ch = 0xFFFD; } - dst += Tcl_UniCharToUtf(*chPtr, dst); + dst += Tcl_UniCharToUtf(ch, dst); } } @@ -2420,7 +2420,7 @@ UtfToUtfProc( static int Utf16ToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* flags */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2444,6 +2444,7 @@ Utf16ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; + int encflags = PTR2INT(clientData); unsigned short ch; if (flags & TCL_ENCODING_CHAR_LIMIT) { @@ -2457,7 +2458,7 @@ Utf16ToUtfProc( srcLen--; } /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + if ((srcLen >= 2) && ((src[srcLen - ((encflags & FLAG_LE)?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } @@ -2474,7 +2475,7 @@ Utf16ToUtfProc( break; } - if (clientData) { + if (encflags & FLAG_LE) { ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); @@ -2515,15 +2516,11 @@ Utf16ToUtfProc( static int UtfToUtf16Proc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData1, /* flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2542,11 +2539,9 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + int encflags = PTR2INT(clientData1); + int ch; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -2572,38 +2567,30 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUniChar(src, chPtr); - - if (clientData) { -#if TCL_UTF_MAX > 3 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); + src += TclUtfToUCS4(src, &ch); + if (!(encflags & FLAG_WTF) && !Tcl_UniCharIsUnicode(ch)) { + ch = 0xFFFD; + } + if (encflags & FLAG_LE) { + if (ch <= 0xFFFF) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); -#endif } else { -#if TCL_UTF_MAX > 3 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); + if (ch <= 0xFFFF) { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); } else { - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; - *dst++ = (*chPtr & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; + *dst++ = (ch & 0xFF); } -#else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); -#endif } } *srcReadPtr = src - srcStart; |