diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-10 21:04:41 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-10 21:04:41 (GMT) |
commit | 919d62dcd3d557c7976c331687ef9e8bbf6560c9 (patch) | |
tree | e68684c2b7317009033535896b177fcdfefb8dce /generic/tclUtf.c | |
parent | e6a53eb44dab26c44e01f4620467c2c5ae0f27e5 (diff) | |
parent | 934e6a98376ded432d70c77b3778869bc49763d4 (diff) | |
download | tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.zip tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.gz tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.bz2 |
Merge 8.7
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 204 |
1 files changed, 202 insertions, 2 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8c0a5b7..59116c4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -267,6 +267,50 @@ Tcl_UniCharToUtfDString( return string; } +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +char * +TclWCharToUtfDString( + const WCHAR *uniStr, /* WCHAR string to convert to UTF-8. */ + int uniLength, /* Length of WCHAR string in Tcl_UniChars + * (must be >= 0). */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + const WCHAR *w, *wEnd; + char *p, *string; + int oldLength, len = 1; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * 4. + */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + if (!len && ((*w & 0xFC00) != 0xDC00)) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + len = Tcl_UniCharToUtf(*w, p); + p += len; + if ((*w >= 0xD800) && (len < 3)) { + len = 0; /* Indication that high surrogate was found */ + } + w++; + } + if (!len) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} +#endif /* *--------------------------------------------------------------------------- * @@ -418,7 +462,109 @@ Tcl_UtfToUniChar( *chPtr = byte; return 1; } - + +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +int +TclUtfToWChar( + const char *src, /* The UTF-8 string. */ + WCHAR *chPtr)/* Filled with the WCHAR represented by + * the UTF-8 string. */ +{ + WCHAR byte; + + /* + * Unroll 1 to 4 byte UTF-8 sequences. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid + * characters representing themselves. + */ + + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. + */ + if ((byte >= 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) + && ((src[2] & 0xC0) == 0x80)) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } + if ((unsigned)(byte-0x80) < (unsigned)0x20) { + *chPtr = cp1252[byte-0x80]; + } else { + *chPtr = byte; + } + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { + return 2; + } + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + if (*chPtr > 0x7FF) { + return 3; + } + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + } + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if (high >= 0x400) { + /* out of range, < 0x10000 or > 0x10ffff */ + } else { + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + high; + return 1; + } + } + + /* + * A four-byte-character lead-byte not followed by three trail-bytes + * represents itself. + */ + } + + *chPtr = byte; + return 1; +} +#endif + /* *--------------------------------------------------------------------------- * @@ -489,7 +635,61 @@ Tcl_UtfToUniCharDString( return wString; } - + +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +WCHAR * +TclUtfToWCharDString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + WCHAR ch = 0, *w, *wString; + const char *p, *end; + int oldLength; + + if (length < 0) { + length = strlen(src); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. + */ + + oldLength = Tcl_DStringLength(dsPtr); + + Tcl_DStringSetLength(dsPtr, + oldLength + (int) ((length + 1) * sizeof(WCHAR))); + wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + p = src; + end = src + length - 4; + while (p < end) { + p += TclUtfToWChar(p, &ch); + *w++ = ch; + } + end += 4; + while (p < end) { + if (Tcl_UtfCharComplete(p, end-p)) { + p += TclUtfToWChar(p, &ch); + } else if (((UCHAR(*p)-0x80)) < 0x20) { + ch = cp1252[UCHAR(*p++)-0x80]; + } else { + ch = UCHAR(*p++); + } + *w++ = ch; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + oldLength + ((char *) w - (char *) wString)); + + return wString; +} +#endif /* *--------------------------------------------------------------------------- * |