summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2017-06-08 08:26:58 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2017-06-08 08:26:58 (GMT)
commit16f3f234e8500f5f71e4d9321689a8bdf9efc809 (patch)
tree5d0ad393849e7c2d6b1b88d3f6d413ff2b505f14 /generic/tclUtf.c
parent73a3dfdeeabb1a43c73101b4b6a9826f83866b32 (diff)
downloadtcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.zip
tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.gz
tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.bz2
Better UTF-8 surrogate handling, only functional when TCL_UTF_MAX>3
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c68
1 files changed, 49 insertions, 19 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 52b4291..db941e2 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -134,7 +134,7 @@ UtfCount(
*---------------------------------------------------------------------------
*/
-INLINE int
+int
Tcl_UniCharToUtf(
int ch, /* The Tcl_UniChar to be stored in the
* buffer. */
@@ -259,6 +259,15 @@ Tcl_UniCharToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
+ * If TCL_UTF_MAX == 4, surrogate pairs get special handling: when src
+ * points at a 4-byte UTF-8 sequence (a character outside the BMP), the
+ * first call to this function fills *chPtr with the high surrogate and
+ * returns 0, so src is not advanced. Calling Tcl_UtfToUniChar again with
+ * the same src and chPtr produces the low surrogate and returns 4.
+ * Because *chPtr is used to remember whether the high surrogate has
+ * already been produced, the variable it points to should be initialized
+ * to 0 before the first call to Tcl_UtfToUniChar.
+ *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -278,7 +287,7 @@ Tcl_UtfToUniChar(
register int byte;
/*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+ * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
*/
byte = *((unsigned char *) src);
@@ -331,12 +340,30 @@ Tcl_UtfToUniChar(
/*
* Four-byte-character lead byte followed by three trail bytes.
*/
-
+#if TCL_UTF_MAX == 4
+ Tcl_UniChar surrogate;
+
+ byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
+ surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
+ if (byte & 0x100000) {
+ /* out of range, < 0x10000 or > 0x10ffff */
+ } else if (*chPtr != surrogate) {
+ /* produce high surrogate, but don't advance source pointer */
+ *chPtr = surrogate;
+ return 0;
+ } else {
+ /* produce low surrogate, and advance source pointer */
+ *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
+ return 4;
+ }
+#else
*chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
return 4;
}
+#endif
}
/*
@@ -377,7 +404,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar *w, *wString;
+ Tcl_UniChar ch, *w, *wString;
const char *p, *end;
int oldLength;
@@ -399,8 +426,8 @@ Tcl_UtfToUniCharDString(
w = wString;
end = src + length;
for (p = src; p < end; ) {
- p += TclUtfToUniChar(p, w);
- w++;
+ p += TclUtfToUniChar(p, &ch);
+ *w++ = ch;
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
@@ -434,9 +461,8 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- int ch;
+ int ch = *((unsigned char *) src);
- ch = *((unsigned char *) src);
return length >= totalBytes[ch];
}
@@ -464,8 +490,7 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
- Tcl_UniChar ch;
- register Tcl_UniChar *chPtr = &ch;
+ Tcl_UniChar ch = 0;
register int i;
/*
@@ -478,7 +503,7 @@ Tcl_NumUtfChars(
i = 0;
if (length < 0) {
while (*src != '\0') {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUniChar(src, &ch);
i++;
}
} else {
@@ -489,7 +514,7 @@ Tcl_NumUtfChars(
length--;
src++;
} else {
- n = Tcl_UtfToUniChar(src, chPtr);
+ n = Tcl_UtfToUniChar(src, &ch);
length -= n;
src += n;
}
@@ -524,7 +549,7 @@ Tcl_UtfFindFirst(
int ch) /* The Tcl_UniChar to search for. */
{
int len;
- Tcl_UniChar find;
+ Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
@@ -563,7 +588,7 @@ Tcl_UtfFindLast(
int ch) /* The Tcl_UniChar to search for. */
{
int len;
- Tcl_UniChar find;
+ Tcl_UniChar find = 0;
const char *last;
last = NULL;
@@ -603,9 +628,15 @@ const char *
Tcl_UtfNext(
const char *src) /* The current location in the string. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
+ int len = TclUtfToUniChar(src, &ch);
- return src + TclUtfToUniChar(src, &ch);
+#if TCL_UTF_MAX == 4
+ if (len == 0) {
+ len = TclUtfToUniChar(src, &ch);
+ }
+#endif
+ return src + len;
}
/*
@@ -638,8 +669,7 @@ Tcl_UtfPrev(
const char *look;
int i, byte;
- src--;
- look = src;
+ look = --src;
for (i = 0; i < TCL_UTF_MAX; i++) {
if (look < start) {
if (src < start) {
@@ -712,7 +742,7 @@ Tcl_UtfAtIndex(
register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index > 0) {
index--;