Merge 8.7

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-03-10 21:04:41 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-03-10 21:04:41 (GMT)
commit: 919d62dcd3d557c7976c331687ef9e8bbf6560c9 (patch)
tree: e68684c2b7317009033535896b177fcdfefb8dce /generic/tclUtf.c
parent: e6a53eb44dab26c44e01f4620467c2c5ae0f27e5 (diff)
parent: 934e6a98376ded432d70c77b3778869bc49763d4 (diff)
download: tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.zip
tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.gz
tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.bz2
1 files changed, 202 insertions, 2 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 8c0a5b7..59116c4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -267,6 +267,50 @@ Tcl_UniCharToUtfDString(
     return string;
 }
 
+#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+char *				/* Returns start of the UTF-8 bytes appended to dsPtr. */
+TclWCharToUtfDString(
+    const WCHAR *uniStr,	/* WCHAR string to convert to UTF-8. */
+    int uniLength,		/* Length of WCHAR string in WCHAR units
+				 * (must be >= 0). */
+    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
+				 * to this previously initialized DString. */
+{
+    const WCHAR *w, *wEnd;
+    char *p, *string;
+    int oldLength, len = 1;	/* len == 0 marks a pending high surrogate */
+
+    /*
+     * UTF-8 length in bytes will be <= WCHAR count * 4; trimmed to fit below.
+     */
+
+    oldLength = Tcl_DStringLength(dsPtr);
+    Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
+    string = Tcl_DStringValue(dsPtr) + oldLength;	/* first appended byte */
+
+    p = string;
+    wEnd = uniStr + uniLength;
+    for (w = uniStr; w < wEnd; ) {
+	if (!len && ((*w & 0xFC00) != 0xDC00)) {
+	    /* High surrogate pending but *w is not a low surrogate (DC00-DFFF); the -1 call presumably flushes it - TODO confirm. */
+	    p += Tcl_UniCharToUtf(-1, p);
+	}
+	len = Tcl_UniCharToUtf(*w, p);
+	p += len;
+	if ((*w >= 0xD800) && (len < 3)) {
+	    len = 0; /* Indication that high surrogate was found */
+	}
+	w++;
+    }
+    if (!len) {
+	/* Input ended while a high surrogate was still pending. */
+	p += Tcl_UniCharToUtf(-1, p);
+    }
+    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));	/* trim to bytes written */
+
+    return string;
+}
+#endif /* (TCL_UTF_MAX > 4) && (__CYGWIN__ || _WIN32) */
 /*
  *---------------------------------------------------------------------------
  *
@@ -418,7 +462,109 @@ Tcl_UtfToUniChar(
     *chPtr = byte;
     return 1;
 }
-
+
+#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+int				/* Returns the number of bytes of src consumed (1 to 3). */
+TclUtfToWChar(
+    const char *src,	/* The UTF-8 string. */
+    WCHAR *chPtr)/* Filled with the decoded WCHAR; its value on entry
+				 * is checked for a pending high surrogate. */
+{
+    WCHAR byte;
+
+    /*
+     * Unroll 1 to 4 byte UTF-8 sequences.
+     */
+
+    byte = *((unsigned char *) src);
+    if (byte < 0xC0) {
+	/*
+	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
+	 * Treats naked trail bytes 0x80 to 0x9F as valid characters from
+	 * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
+	 * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
+	 * characters representing themselves.
+	 */
+
+	/* If *chPtr contains a high surrogate (as produced by an earlier call
+	 * that hit a 4-byte sequence and consumed only its lead byte) and src
+	 * points at that sequence's 3 continuation bytes, then we must produce
+	 * the follow-up low surrogate. We only do that if the high surrogate
+	 * matches the bits we encounter.
+	 */
+	if ((byte >= 0x80)
+		&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
+		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
+		&& ((src[2] & 0xC0) == 0x80)) {
+	    *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
+	    return 3;	/* all three continuation bytes are consumed here */
+	}
+	if ((unsigned)(byte-0x80) < (unsigned)0x20) {
+	    *chPtr = cp1252[byte-0x80];
+	} else {
+	    *chPtr = byte;
+	}
+	return 1;
+    } else if (byte < 0xE0) {
+	if ((src[1] & 0xC0) == 0x80) {
+	    /*
+	     * Two-byte-character lead-byte followed by a trail-byte.
+	     */
+
+	    *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F));
+	    if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+		return 2;
+	    }
+	}
+
+	/*
+	 * A two-byte-character lead-byte not followed by trail-byte
+	 * represents itself.
+	 */
+    } else if (byte < 0xF0) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
+	    /*
+	     * Three-byte-character lead byte followed by two trail bytes.
+	     */
+
+	    *chPtr = (((byte & 0x0F) << 12)
+		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+	    if (*chPtr > 0x7FF) {
+		return 3;
+	    }
+	}
+
+	/*
+	 * A three-byte-character lead-byte not followed by two trail-bytes
+	 * represents itself.
+	 */
+    }
+    else if (byte < 0xF8) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+	    /*
+	     * Four-byte-character lead byte followed by three trail bytes.
+	     */
+	    WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+		    | ((src[2] & 0x3F) >> 4)) - 0x40;
+	    if (high >= 0x400) {
+		/* out of range, < 0x10000 or > 0x10ffff */
+	    } else {
+		/* produce high surrogate; only the lead byte is consumed */
+		*chPtr = 0xD800 + high;
+		return 1;
+	    }
+	}
+
+	/*
+	 * A four-byte-character lead-byte not followed by three trail-bytes
+	 * represents itself.
+	 */
+    }
+
+    *chPtr = byte;	/* every rejected sequence ends here: the byte represents itself */
+    return 1;
+}
+#endif /* (TCL_UTF_MAX > 4) && (__CYGWIN__ || _WIN32) */
+
 /*
  *---------------------------------------------------------------------------
  *
@@ -489,7 +635,61 @@ Tcl_UtfToUniCharDString(
 
     return wString;
 }
-
+
+#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+WCHAR *				/* Returns start of the WCHARs appended to dsPtr. */
+TclUtfToWCharDString(
+    const char *src,		/* UTF-8 string to convert to Unicode. */
+    int length,			/* Length of UTF-8 string in bytes, or -1 for
+				 * strlen(). */
+    Tcl_DString *dsPtr)		/* Unicode representation of string is
+				 * appended to this previously initialized
+				 * DString. */
+{
+    WCHAR ch = 0, *w, *wString;	/* ch persists across calls so a pending high surrogate is seen */
+    const char *p, *end;
+    int oldLength;
+
+    if (length < 0) {
+	length = strlen(src);
+    }
+
+    /*
+     * Result length in WCHARs will be <= UTF-8 string length in bytes,
+     * so (length + 1) WCHARs are reserved, including the terminator.
+     */
+
+    oldLength = Tcl_DStringLength(dsPtr);
+
+    Tcl_DStringSetLength(dsPtr,
+	    oldLength + (int) ((length + 1) * sizeof(WCHAR)));
+    wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength);
+
+    w = wString;
+    p = src;
+    end = src + length - 4;	/* first loop stops 4 bytes short of the end. NOTE(review): points before src when length < 4 */
+    while (p < end) {
+	p += TclUtfToWChar(p, &ch);
+	*w++ = ch;
+    }
+    end += 4;	/* now the real end of the input */
+    while (p < end) {
+	if (Tcl_UtfCharComplete(p, end-p)) {	/* presumably: a whole sequence fits in the remaining bytes - TODO confirm */
+	    p += TclUtfToWChar(p, &ch);
+	} else if (((UCHAR(*p)-0x80)) < 0x20) {
+	    ch = cp1252[UCHAR(*p++)-0x80];
+	} else {
+	    ch = UCHAR(*p++);	/* incomplete sequence: the byte is stored as-is */
+	}
+	*w++ = ch;
+    }
+    *w = '\0';	/* terminator is written but not counted in the length set below */
+    Tcl_DStringSetLength(dsPtr,
+	    oldLength + ((char *) w - (char *) wString));
+
+    return wString;
+}
+#endif /* (TCL_UTF_MAX > 4) && (__CYGWIN__ || _WIN32) */
 /*
  *---------------------------------------------------------------------------
  *
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2019-03-10 21:04:41 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2019-03-10 21:04:41 (GMT)
commit	919d62dcd3d557c7976c331687ef9e8bbf6560c9 (patch)
tree	e68684c2b7317009033535896b177fcdfefb8dce /generic/tclUtf.c
parent	e6a53eb44dab26c44e01f4620467c2c5ae0f27e5 (diff)
parent	934e6a98376ded432d70c77b3778869bc49763d4 (diff)
download	tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.zip tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.gz tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.bz2