diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2017-06-08 08:26:58 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2017-06-08 08:26:58 (GMT) |
commit | 16f3f234e8500f5f71e4d9321689a8bdf9efc809 (patch) | |
tree | 5d0ad393849e7c2d6b1b88d3f6d413ff2b505f14 /generic/tclUtf.c | |
parent | 73a3dfdeeabb1a43c73101b4b6a9826f83866b32 (diff) | |
download | tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.zip tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.gz tcl-16f3f234e8500f5f71e4d9321689a8bdf9efc809.tar.bz2 |
Better UTF-8 surrogate handling, only functional when TCL_UTF_MAX>3
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 68 |
1 files changed, 49 insertions, 19 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 52b4291..db941e2 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -134,7 +134,7 @@ UtfCount( *--------------------------------------------------------------------------- */ -INLINE int +int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the * buffer. */ @@ -259,6 +259,15 @@ Tcl_UniCharToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * + * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done: + * For any UTF-8 string containing a character outside of the BMP, the + * first call to this function will fill *chPtr with the high surrogate + * and generate a return value of 0. Calling Tcl_UtfToUniChar again + * will produce the low surrogate and a return value of 4. Because *chPtr + * is used to remember whether the high surrogate is already produced, it + * is recommended to initialize the variable it points to as 0 before + * the first call to Tcl_UtfToUniChar is done. + * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -278,7 +287,7 @@ Tcl_UtfToUniChar( register int byte; /* - * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. + * Unroll 1 to 3 (or 4) byte UTF-8 sequences. */ byte = *((unsigned char *) src); @@ -331,12 +340,30 @@ Tcl_UtfToUniChar( /* * Four-byte-character lead byte followed by three trail bytes. */ - +#if TCL_UTF_MAX == 4 + Tcl_UniChar surrogate; + + byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; + surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); + if (byte & 0x100000) { + /* out of range, < 0x10000 or > 0x10ffff */ + } else if (*chPtr != surrogate) { + /* produce high surrogate, but don't advance source pointer */ + *chPtr = surrogate; + return 0; + } else { + /* produce low surrogate, and advance source pointer */ + *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); + return 4; + } +#else *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { return 4; } +#endif } /* @@ -377,7 +404,7 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar *w, *wString; + Tcl_UniChar ch, *w, *wString; const char *p, *end; int oldLength; @@ -399,8 +426,8 @@ Tcl_UtfToUniCharDString( w = wString; end = src + length; for (p = src; p < end; ) { - p += TclUtfToUniChar(p, w); - w++; + p += TclUtfToUniChar(p, &ch); + *w++ = ch; } *w = '\0'; Tcl_DStringSetLength(dsPtr, @@ -434,9 +461,8 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - int ch; + int ch = *((unsigned char *) src); - ch = *((unsigned char *) src); return length >= totalBytes[ch]; } @@ -464,8 +490,7 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - Tcl_UniChar ch; - register Tcl_UniChar *chPtr = &ch; + Tcl_UniChar ch = 0; register int i; /* @@ -478,7 +503,7 @@ Tcl_NumUtfChars( i = 0; if (length < 0) { while (*src != '\0') { - src += TclUtfToUniChar(src, chPtr); + src += TclUtfToUniChar(src, &ch); i++; } } else { @@ -489,7 +514,7 @@ Tcl_NumUtfChars( length--; src++; } else { - n = Tcl_UtfToUniChar(src, chPtr); + n = Tcl_UtfToUniChar(src, &ch); length -= n; src += n; } @@ -524,7 +549,7 @@ Tcl_UtfFindFirst( int ch) /* The Tcl_UniChar to search for. */ { int len; - Tcl_UniChar find; + Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); @@ -563,7 +588,7 @@ Tcl_UtfFindLast( int ch) /* The Tcl_UniChar to search for. */ { int len; - Tcl_UniChar find; + Tcl_UniChar find = 0; const char *last; last = NULL; @@ -603,9 +628,15 @@ const char * Tcl_UtfNext( const char *src) /* The current location in the string. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; + int len = TclUtfToUniChar(src, &ch); - return src + TclUtfToUniChar(src, &ch); +#if TCL_UTF_MAX == 4 + if (len == 0) { + len = TclUtfToUniChar(src, &ch); + } +#endif + return src + len; } /* @@ -638,8 +669,7 @@ Tcl_UtfPrev( const char *look; int i, byte; - src--; - look = src; + look = --src; for (i = 0; i < TCL_UTF_MAX; i++) { if (look < start) { if (src < start) { @@ -712,7 +742,7 @@ Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; while (index > 0) { index--; |