summary refs log tree commit diff stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
author jan.nijtmans <nijtmans@users.sourceforge.net> 2019-03-10 21:04:41 (GMT)
committer jan.nijtmans <nijtmans@users.sourceforge.net> 2019-03-10 21:04:41 (GMT)
commit 919d62dcd3d557c7976c331687ef9e8bbf6560c9 (patch)
tree e68684c2b7317009033535896b177fcdfefb8dce /generic/tclUtf.c
parent e6a53eb44dab26c44e01f4620467c2c5ae0f27e5 (diff)
parent 934e6a98376ded432d70c77b3778869bc49763d4 (diff)
download tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.zip
tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.gz
tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.bz2
Merge 8.7
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 204
1 files changed, 202 insertions, 2 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 8c0a5b7..59116c4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -267,6 +267,50 @@ Tcl_UniCharToUtfDString(
return string;
}
+#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+char *
+TclWCharToUtfDString(
+ const WCHAR *uniStr, /* WCHAR string to convert to UTF-8. */
+ int uniLength, /* Length of WCHAR string in WCHAR units
+ * (must be >= 0). */
+ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
+ * to this previously initialized DString. */
+{
+ const WCHAR *w, *wEnd;
+ char *p, *string;
+ int oldLength, len = 1; /* len == 0 marks "previous unit was a high
+ * surrogate"; it starts non-zero so the
+ * first unit is not treated that way. */
+
+ /*
+ * Converts uniLength WCHAR units starting at uniStr to UTF-8, appends
+ * the bytes to dsPtr, and returns a pointer to the first appended
+ * byte (i.e. the DString value plus its previous length).
+ *
+ * UTF-8 string length in bytes will be <= Unicode string length * 4.
+ * The DString is therefore grown by (uniLength + 1) * 4 bytes up front
+ * and trimmed to the number of bytes actually written at the end.
+ */
+
+ oldLength = Tcl_DStringLength(dsPtr);
+ Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
+ string = Tcl_DStringValue(dsPtr) + oldLength;
+
+ p = string;
+ wEnd = uniStr + uniLength;
+ for (w = uniStr; w < wEnd; ) {
+ if (!len && ((*w & 0xFC00) != 0xDC00)) {
+ /* Special case for handling high surrogates: the previous unit
+ * was flagged as a high surrogate and the current one is not in
+ * the low surrogate range 0xDC00-0xDFFF. NOTE(review): the -1
+ * call presumably makes Tcl_UniCharToUtf emit the pending
+ * surrogate on its own - confirm against its implementation. */
+ p += Tcl_UniCharToUtf(-1, p);
+ }
+ len = Tcl_UniCharToUtf(*w, p);
+ p += len;
+ if ((*w >= 0xD800) && (len < 3)) {
+ len = 0; /* Indication that high surrogate was found */
+ }
+ w++;
+ }
+ if (!len) {
+ /* Special case for handling high surrogates: the input ended
+ * right after a unit flagged as a high surrogate. */
+ p += Tcl_UniCharToUtf(-1, p);
+ }
+ Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); /* trim to bytes written */
+
+ return string;
+}
+#endif
/*
*---------------------------------------------------------------------------
*
@@ -418,7 +462,109 @@ Tcl_UtfToUniChar(
*chPtr = byte;
return 1;
}
-
+
+#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+int
+TclUtfToWChar(
+ const char *src, /* The UTF-8 string. */
+ WCHAR *chPtr)/* Filled with the WCHAR represented by
+ * the UTF-8 string. It is also read on entry:
+ * it may still hold the high surrogate stored
+ * by the preceding call (see below). */
+{
+ WCHAR byte;
+
+ /*
+ * Unroll 1 to 4 byte UTF-8 sequences.
+ *
+ * The return value is the number of bytes of src that were consumed
+ * (1, 2 or 3). A valid 4-byte sequence is delivered as a surrogate
+ * pair over two calls: the first call stores the high surrogate and
+ * returns 1; the second call, entered at the first trail byte with
+ * *chPtr still holding that high surrogate, stores the low surrogate
+ * and returns 3. Any sequence that fails validation falls through to
+ * the end of the function, where the lead byte itself is stored and 1
+ * is returned.
+ */
+
+ byte = *((unsigned char *) src);
+ if (byte < 0xC0) {
+ /*
+ * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
+ * Treats naked trail bytes 0x80 to 0x9F as valid characters from
+ * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
+ * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
+ * characters representing themselves.
+ */
+
+ /* If *chPtr contains a high surrogate (left there by a previous
+ * call that decoded the lead byte of a 4-byte sequence, as the
+ * four-byte branch below does) and the next 3 bytes are UTF-8
+ * continuation bytes, then we must produce a follow-up low surrogate.
+ * We only do that if the high surrogate matches the bits we
+ * encounter: *chPtr must lie in 0xD800-0xDBFF, bits 2-7 of its low
+ * byte must equal the payload of this trail byte (the - 0x10 undoes
+ * the 0x10000 offset applied when the high surrogate was built), and
+ * its low two bits must equal the top two payload bits of src[1].
+ */
+ if ((byte >= 0x80)
+ && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
+ && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
+ && ((src[2] & 0xC0) == 0x80)) {
+ /* Low surrogate carries the remaining low 10 bits of the code point. */
+ *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
+ return 3;
+ }
+ if ((unsigned)(byte-0x80) < (unsigned)0x20) {
+ *chPtr = cp1252[byte-0x80];
+ } else {
+ *chPtr = byte;
+ }
+ return 1;
+ } else if (byte < 0xE0) {
+ if ((src[1] & 0xC0) == 0x80) {
+ /*
+ * Two-byte-character lead-byte followed by a trail-byte.
+ * The result is accepted only if it is 0 (0 - 1 wraps when cast
+ * to unsigned) or at least UNICODE_SELF; any other value is an
+ * overlong encoding and falls through to the single-byte case.
+ */
+
+ *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F));
+ if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+ return 2;
+ }
+ }
+
+ /*
+ * A two-byte-character lead-byte not followed by trail-byte
+ * represents itself.
+ */
+ } else if (byte < 0xF0) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
+ /*
+ * Three-byte-character lead byte followed by two trail bytes.
+ * Values up to 0x7FF are overlong here and fall through.
+ */
+
+ *chPtr = (((byte & 0x0F) << 12)
+ | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+ if (*chPtr > 0x7FF) {
+ return 3;
+ }
+ }
+
+ /*
+ * A three-byte-character lead-byte not followed by two trail-bytes
+ * represents itself.
+ */
+ }
+ else if (byte < 0xF8) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+ /*
+ * Four-byte-character lead byte followed by three trail bytes.
+ * "high" is the code point shifted right by 10 bits, minus 0x40
+ * (i.e. minus 0x10000 >> 10). NOTE(review): the range test below
+ * assumes WCHAR is an unsigned 16-bit type, so that code points
+ * below 0x10000 wrap to a value >= 0x400 - confirm.
+ */
+ WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+ | ((src[2] & 0x3F) >> 4)) - 0x40;
+ if (high >= 0x400) {
+ /* out of range, < 0x10000 or > 0x10ffff */
+ } else {
+ /* produce high surrogate, advance source pointer by the lead
+ * byte only; the next call consumes the three trail bytes */
+ *chPtr = 0xD800 + high;
+ return 1;
+ }
+ }
+
+ /*
+ * A four-byte-character lead-byte not followed by three trail-bytes
+ * represents itself.
+ */
+ }
+
+ /* Fallback for every rejected sequence: the lead byte stands for itself
+ * (this overwrites any tentative value stored in *chPtr above). */
+ *chPtr = byte;
+ return 1;
+}
+#endif
+
/*
*---------------------------------------------------------------------------
*
@@ -489,7 +635,61 @@ Tcl_UtfToUniCharDString(
return wString;
}
-
+
+#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+WCHAR *
+TclUtfToWCharDString(
+ const char *src, /* UTF-8 string to convert to WCHARs. */
+ int length, /* Length of UTF-8 string in bytes, or -1 for
+ * strlen(). */
+ Tcl_DString *dsPtr) /* WCHAR representation of string is
+ * appended to this previously initialized
+ * DString. */
+{
+ WCHAR ch = 0, *w, *wString;
+ const char *p, *end, *fastEnd;
+ int oldLength;
+
+ if (length < 0) {
+ length = strlen(src);
+ }
+
+ /*
+ * Appends the WCHAR form of src to dsPtr and returns a pointer to the
+ * first appended WCHAR.
+ *
+ * WCHAR string length in WCHARs will be <= UTF-8 string length in
+ * bytes (each conversion step below consumes at least one byte and
+ * stores exactly one WCHAR), so reserve one WCHAR per byte plus one for
+ * the terminator and trim to what was actually written at the end.
+ */
+
+ oldLength = Tcl_DStringLength(dsPtr);
+
+ Tcl_DStringSetLength(dsPtr,
+ oldLength + (int) ((length + 1) * sizeof(WCHAR)));
+ wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength);
+
+ w = wString;
+ p = src;
+ end = src + length;
+
+ /*
+ * Fast path: while more than 4 bytes remain, TclUtfToWChar() can be
+ * called without checking that the sequence at p is complete. The
+ * bound is clamped to src for short inputs so that no pointer before
+ * the start of the buffer is ever formed.
+ */
+
+ fastEnd = (length > 4) ? end - 4 : src;
+ while (p < fastEnd) {
+ p += TclUtfToWChar(p, &ch);
+ *w++ = ch;
+ }
+
+ /*
+ * Tail: only decode a sequence if all of its bytes lie before end;
+ * otherwise map the single byte the same way TclUtfToWChar() maps a
+ * naked byte. The range test is done unsigned so that a byte below
+ * 0x80 can never be used as a negative index into cp1252[].
+ */
+
+ while (p < end) {
+ if (Tcl_UtfCharComplete(p, end-p)) {
+ p += TclUtfToWChar(p, &ch);
+ } else if ((unsigned)(UCHAR(*p)-0x80) < (unsigned)0x20) {
+ ch = cp1252[UCHAR(*p++)-0x80];
+ } else {
+ ch = UCHAR(*p++);
+ }
+ *w++ = ch;
+ }
+ *w = '\0';
+ Tcl_DStringSetLength(dsPtr,
+ oldLength + ((char *) w - (char *) wString));
+
+ return wString;
+}
+#endif
/*
*---------------------------------------------------------------------------
*