diff options
| -rw-r--r-- | generic/tclBinary.c | 16 | ||||
| -rw-r--r-- | generic/tclCmdMZ.c | 9 | ||||
| -rw-r--r-- | generic/tclInt.h | 2 | ||||
| -rw-r--r-- | generic/tclParse.c | 23 | ||||
| -rw-r--r-- | generic/tclUtf.c | 68 |
5 files changed, 77 insertions, 41 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c index 6306159..52ef457 100644 --- a/generic/tclBinary.c +++ b/generic/tclBinary.c @@ -1222,11 +1222,11 @@ BinaryFormatCmd( badField: { - Tcl_UniChar ch = 0; - char buf[TCL_UTF_MAX + 1] = ""; + int ch; + char buf[8] = ""; - TclUtfToUniChar(errorString, &ch); - buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; + TclUtfToUCS4(errorString, &ch); + buf[TclUCS4ToUtf(ch, buf)] = '\0'; Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad field specifier \"%s\"", buf)); return TCL_ERROR; @@ -1592,11 +1592,11 @@ BinaryScanCmd( badField: { - Tcl_UniChar ch = 0; - char buf[TCL_UTF_MAX + 1] = ""; + int ch; + char buf[8] = ""; - TclUtfToUniChar(errorString, &ch); - buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; + TclUtfToUCS4(errorString, &ch); + buf[TclUCS4ToUtf(ch, buf)] = '\0'; Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad field specifier \"%s\"", buf)); return TCL_ERROR; diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 162a5a6..011164b 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1413,14 +1413,9 @@ StringIndexCmd( Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { - char buf[TCL_UTF_MAX] = ""; + char buf[8] = ""; - length = Tcl_UniCharToUtf(ch, buf); -#if TCL_UTF_MAX > 3 - if ((ch >= 0xD800) && (length < 3)) { - length += Tcl_UniCharToUtf(-1, buf + length); - } -#endif + length = TclUCS4ToUtf(ch, buf); Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length)); } } diff --git a/generic/tclInt.h b/generic/tclInt.h index 5c46470..6f024a6 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3184,6 +3184,8 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +MODULE_SCOPE int TclUCS4ToUtf(int, char *); + /* * Bytes F0-F4 are start-bytes for 4-byte sequences. * Byte 0xED can be the start-byte of an upper surrogate. In that case, diff --git a/generic/tclParse.c b/generic/tclParse.c index 7beaeea..23a07cf 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -843,7 +843,6 @@ TclParseBackslash( * written there. */ { register const char *p = src+1; - Tcl_UniChar unichar = 0; int result; int count; char buf[TCL_UTF_MAX] = ""; @@ -943,7 +942,7 @@ TclParseBackslash( * No hexdigits -> This is just "U". */ result = 'U'; - } else if ((result | 0x7FF) == 0xDFFF) { + } else if ((result & ~0x7FF) == 0xD800) { /* Upper or lower surrogate, not allowed in this syntax. */ result = 0xFFFD; } @@ -991,16 +990,15 @@ TclParseBackslash( * #217987] test subst-3.2 */ - if (Tcl_UtfCharComplete(p, numBytes - 1)) { - count = TclUtfToUniChar(p, &unichar) + 1; /* +1 for '\' */ + if (TclUCS4Complete(p, numBytes - 1)) { + count = TclUtfToUCS4(p, &result) + 1; /* +1 for '\' */ } else { - char utfBytes[TCL_UTF_MAX]; + char utfBytes[8]; - memcpy(utfBytes, p, (size_t) (numBytes - 1)); + memcpy(utfBytes, p, numBytes - 1); utfBytes[numBytes - 1] = '\0'; - count = TclUtfToUniChar(utfBytes, &unichar) + 1; + count = TclUtfToUCS4(utfBytes, &result) + 1; } - result = unichar; break; } @@ -1008,13 +1006,12 @@ TclParseBackslash( if (readPtr != NULL) { *readPtr = count; } - count = Tcl_UniCharToUtf(result, dst); -#if TCL_UTF_MAX > 3 - if ((result >= 0xD800) && (count < 3)) { - count += Tcl_UniCharToUtf(-1, dst + count); +#if TCL_UTF_MAX < 4 + if (result > 0xFFFF) { + result = 0xFFFD; } #endif - return count; + return TclUCS4ToUtf(result, dst); } /* diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 03a7ca9..a14ce71 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2354,7 +2354,7 @@ TclUniCharMatch( * routine does not run off the end and dereference non-existent memory * looking for trail bytes. If the source buffer is known to be '\0' * terminated, this cannot happen. Otherwise, the caller should call - * Tcl_UtfCharComplete() before calling this routine to ensure that + * TclUCS4Complete() before calling this routine to ensure that * enough bytes remain in the string. * * Results: @@ -2373,26 +2373,68 @@ TclUtfToUCS4( int *ucs4Ptr) /* Filled with the UCS4 codepoint represented * by the UTF-8 string. */ { - int len, fullchar; Tcl_UniChar ch = 0; + int len = Tcl_UtfToUniChar(src, &ch); - len = TclUtfToUniChar(src, &ch); - fullchar = ch; - -#if TCL_UTF_MAX == 4 - /* 4-byte UTF-8 is supported; decode surrogates */ - - if ((ch >= 0xD800) && len < 3) { - len += Tcl_UtfToUniChar(src + len, &ch); - fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; +#if TCL_UTF_MAX <= 4 + if ((ch & ~0x3FF) == 0xD800) { + Tcl_UniChar low = ch; + int len2 = Tcl_UtfToUniChar(src+len, &low); + if ((low & ~0x3FF) == 0xDC00) { + *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; + return len + len2; + } } #endif - - *ucs4Ptr = fullchar; + *ucs4Ptr = (int)ch; return len; } /* + *--------------------------------------------------------------------------- + * + * TclUCS4ToUtf -- + * + * Store the given Unicode character as a sequence of UTF-8 bytes in the + * provided buffer. Might output 6 bytes, if the code point > 0xFFFF. + * + * Results: + * The return values is the number of bytes in the buffer that were + * consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +TclUCS4ToUtf( + int ch, /* Unicode character to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the Unicode character is stored. Buffer must be + * large enough to hold the UTF-8 character(s) + * (at most 6 bytes). */ +{ +#if TCL_UTF_MAX <= 4 + if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { + /* Spit out a 4-byte UTF-8 character or 2 x 3-byte UTF-8 characters, depending on Tcl + * version and/or TCL_UTF_MAX build value */ + int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf); + return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len); + } +#endif + if ((ch & ~0x7FF) == 0xD800) { + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + return Tcl_UniCharToUtf(ch, buf); +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 |
