Improvement: always export both 16-bit and 32-bit UTF functions

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-07-05 09:03:12 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2019-07-05 09:03:12 (GMT)
commit: 30b0dabb9b5092b7453ff56eae75c009f30eaa1f (patch)
tree: e95abf5ef1adf6677e12d167fb247fc671a8d556 /generic/tclUtf.c
parent: 26a830241ea90ce41d6cddc8f2a720f4cc5b2e73 (diff)
download: tcl-30b0dabb9b5092b7453ff56eae75c009f30eaa1f.zip
tcl-30b0dabb9b5092b7453ff56eae75c009f30eaa1f.tar.gz
tcl-30b0dabb9b5092b7453ff56eae75c009f30eaa1f.tar.bz2
1 files changed, 22 insertions, 50 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index de696a0..7a5fb36 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -221,15 +221,15 @@ three:
  *---------------------------------------------------------------------------
  */
 
-#if TCL_UTF_MAX > 4
+#undef Tcl_UniCharToUtfDString
 char *
 Tcl_UniCharToUtfDString(
-    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
+    const unsigned int *uniStr,	/* Unicode string to convert to UTF-8. */
     int uniLength,		/* Length of Unicode string. */
     Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
 				 * to this previously initialized DString. */
 {
-    const Tcl_UniChar *w, *wEnd;
+    const unsigned int *w, *wEnd;
     char *p, *string;
     int oldLength;
 
@@ -259,10 +259,9 @@ Tcl_UniCharToUtfDString(
 
     return string;
 }
-#endif /* TCL_UTF_MAX > 4 */
 
 char *
-Tcl_Utf16ToUtfDString(
+Tcl_WCharToUtfDString(
     const unsigned short *uniStr,	/* Utf-16 string to convert to UTF-8. */
     int uniLength,		/* Length of Utf-16 string. */
     Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
@@ -353,13 +352,14 @@ static const unsigned short cp1252[32] = {
    0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
 };
 
+#undef Tcl_UtfToUniChar
 int
 Tcl_UtfToUniChar(
     register const char *src,	/* The UTF-8 string. */
-    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+    register unsigned int *chPtr)/* Filled with the unsigned int represented by
 				 * the UTF-8 string. */
 {
-    Tcl_UniChar byte;
+    unsigned int byte;
 
     /*
      * Unroll 1 to 4 byte UTF-8 sequences.
@@ -375,20 +375,6 @@ Tcl_UtfToUniChar(
 	 * characters representing themselves.
 	 */
 
-#if TCL_UTF_MAX <= 4
-	/* If *chPtr contains a high surrogate (produced by a previous
-	 * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
-	 * bytes, then we must produce a follow-up low surrogate. We only
-	 * do that if the high surrogate matches the bits we encounter.
-	 */
-	if ((byte >= 0x80)
-		&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
-		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
-		&& ((src[2] & 0xC0) == 0x80)) {
-	    *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
-	    return 3;
-	}
-#endif
 	if ((unsigned)(byte-0x80) < (unsigned)0x20) {
 	    *chPtr = cp1252[byte-0x80];
 	} else {
@@ -434,23 +420,11 @@ Tcl_UtfToUniChar(
 	    /*
 	     * Four-byte-character lead byte followed by three trail bytes.
 	     */
-#if TCL_UTF_MAX <= 4
-	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
-		    | ((src[2] & 0x3F) >> 4)) - 0x40;
-	    if (high >= 0x400) {
-		/* out of range, < 0x10000 or > 0x10ffff */
-	    } else {
-		/* produce high surrogate, advance source pointer */
-		*chPtr = 0xD800 + high;
-		return 1;
-	    }
-#else
 	    *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
 		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
 	    if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
 		return 4;
 	    }
-#endif
 	}
 
 	/*
@@ -463,14 +437,14 @@ Tcl_UtfToUniChar(
     return 1;
 }
 
-#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
+#undef Tcl_UtfToWChar
 int
-TclUtfToWChar(
+Tcl_UtfToWChar(
     const char *src,	/* The UTF-8 string. */
-    WCHAR *chPtr)/* Filled with the WCHAR represented by
+    unsigned short *chPtr)/* Filled with the unsigned short represented by
 				 * the UTF-8 string. */
 {
-    WCHAR byte;
+    unsigned short byte;
 
     /*
      * Unroll 1 to 4 byte UTF-8 sequences.
@@ -563,7 +537,6 @@ TclUtfToWChar(
     *chPtr = byte;
     return 1;
 }
-#endif
 
 /*
  *---------------------------------------------------------------------------
@@ -583,8 +556,8 @@ TclUtfToWChar(
  *---------------------------------------------------------------------------
  */
 
-#if TCL_UTF_MAX > 4
-Tcl_UniChar *
+#undef Tcl_UtfToUniCharDString
+unsigned int *
 Tcl_UtfToUniCharDString(
     const char *src,		/* UTF-8 string to convert to Unicode. */
     int length,			/* Length of UTF-8 string in bytes, or -1 for
@@ -593,7 +566,7 @@ Tcl_UtfToUniCharDString(
 				 * appended to this previously initialized
 				 * DString. */
 {
-    Tcl_UniChar ch = 0, *w, *wString;
+    unsigned int ch = 0, *w, *wString;
     const char *p, *end;
     int oldLength;
 
@@ -609,20 +582,20 @@ Tcl_UtfToUniCharDString(
     oldLength = Tcl_DStringLength(dsPtr);
 
     Tcl_DStringSetLength(dsPtr,
-	    oldLength + ((length + 1) * sizeof(Tcl_UniChar)));
-    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
+	    oldLength + ((length + 1) * sizeof(unsigned int)));
+    wString = (unsigned int *) (Tcl_DStringValue(dsPtr) + oldLength);
 
     w = wString;
     p = src;
     end = src + length - 4;
     while (p < end) {
-	p += TclUtfToUniChar(p, &ch);
+	p += Tcl_UtfToUniChar(p, &ch);
 	*w++ = ch;
     }
     end += 4;
     while (p < end) {
 	if (Tcl_UtfCharComplete(p, end-p)) {
-	    p += TclUtfToUniChar(p, &ch);
+	    p += Tcl_UtfToUniChar(p, &ch);
 	} else {
 	    ch = UCHAR(*p++);
 	}
@@ -634,10 +607,9 @@ Tcl_UtfToUniCharDString(
 
     return wString;
 }
-#endif /* TCL_UTF_MAX > 4 */
 
 unsigned short *
-Tcl_UtfToUtf16DString(
+Tcl_UtfToWCharDString(
     const char *src,		/* UTF-8 string to convert to Unicode. */
     int length,			/* Length of UTF-8 string in bytes, or -1 for
 				 * strlen(). */
@@ -645,7 +617,7 @@ Tcl_UtfToUtf16DString(
 				 * appended to this previously initialized
 				 * DString. */
 {
-    Tcl_UniChar ch = 0;
+    unsigned short ch = 0;
     unsigned short *w, *wString;
     const char *p, *end;
     int oldLength;
@@ -669,13 +641,13 @@ Tcl_UtfToUtf16DString(
     p = src;
     end = src + length - 4;
     while (p < end) {
-	p += Tcl_UtfToUniChar(p, &ch);
+	p += Tcl_UtfToWChar(p, &ch);
 	*w++ = ch;
     }
     end += 4;
     while (p < end) {
 	if (Tcl_UtfCharComplete(p, end-p)) {
-	    p += Tcl_UtfToUniChar(p, &ch);
+	    p += Tcl_UtfToWChar(p, &ch);
 	} else {
 	    ch = UCHAR(*p++);
 	}
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2019-07-05 09:03:12 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2019-07-05 09:03:12 (GMT)
commit	30b0dabb9b5092b7453ff56eae75c009f30eaa1f (patch)
tree	e95abf5ef1adf6677e12d167fb247fc671a8d556 /generic/tclUtf.c
parent	26a830241ea90ce41d6cddc8f2a720f4cc5b2e73 (diff)
download	tcl-30b0dabb9b5092b7453ff56eae75c009f30eaa1f.zip tcl-30b0dabb9b5092b7453ff56eae75c009f30eaa1f.tar.gz tcl-30b0dabb9b5092b7453ff56eae75c009f30eaa1f.tar.bz2