From 4fc9a4c991e75f3060fad431f5951cda27377a5a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 29 Nov 2017 11:04:20 +0000 Subject: Update some functions in tclUtf.c to handle surrogate pairs when TCL_UTF_MAX == 4. Also update documentation to distinguish better between "Tcl_UniChar" and "Unicode character": Those are not necessary the same when TCL_UTF_MAX == 4. No change when TCL_UTF_MAX == 4 or TCL_UTF_MAX == 6. --- doc/ToUpper.3 | 2 +- doc/UniCharIsAlpha.3 | 5 +--- doc/Utf.3 | 2 +- generic/tclUtf.c | 74 ++++++++++++++++++++++++++++++++++++++++------------ 4 files changed, 61 insertions(+), 22 deletions(-) diff --git a/doc/ToUpper.3 b/doc/ToUpper.3 index b933e9c..be614e7 100644 --- a/doc/ToUpper.3 +++ b/doc/ToUpper.3 @@ -33,7 +33,7 @@ int .SH ARGUMENTS .AS char *str in/out .AP int ch in -The Tcl_UniChar to be converted. +The Unicode character to be converted. .AP char *str in/out Pointer to UTF-8 string to be converted in place. .BE diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 2336c34..5ba3fc9 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -53,14 +53,11 @@ The Tcl_UniChar to be examined. .SH DESCRIPTION .PP -All of the routines described examine Tcl_UniChars and return a +All of the routines described examine Unicode characters and return a boolean value. A non-zero return value means that the character does belong to the character class associated with the called routine. The rest of this document just describes the character classes associated with the various routines. -.PP -Note: A Tcl_UniChar is a Unicode character represented as an unsigned, -fixed-size quantity. .SH "CHARACTER CLASSES" .PP diff --git a/doc/Utf.3 b/doc/Utf.3 index 378c806..9d0c617 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -77,7 +77,7 @@ int Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. .AP int ch in -The Tcl_UniChar to be converted or examined. +The Unicode character to be converted or examined. .AP Tcl_UniChar *chPtr out Filled with the Tcl_UniChar represented by the head of the UTF-8 string. .AP "const char" *src in diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 3b39226..17f769d 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -404,7 +404,7 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar ch, *w, *wString; + Tcl_UniChar ch = 0, *w, *wString; const char *p, *end; int oldLength; @@ -528,13 +528,13 @@ Tcl_NumUtfChars( * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the first occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the - * return value is NULL. + * As above. If the Unicode character does not exist in the given string, + * the return value is NULL. * * Side effects: * None. @@ -545,14 +545,21 @@ Tcl_NumUtfChars( const char * Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (find == fullchar) { return src; } if (*src == '\0') { @@ -567,8 +574,8 @@ Tcl_UtfFindFirst( * * Tcl_UtfFindLast -- * - * Returns a pointer to the last occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the last occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). * * Results: @@ -584,16 +591,23 @@ Tcl_UtfFindFirst( const char * Tcl_UtfFindLast( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (find == fullchar) { last = src; } if (*src == '\0') { @@ -1058,6 +1072,15 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { return (ch1 - ch2); } @@ -1090,6 +1113,7 @@ Tcl_UtfNcasecmp( unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1098,6 +1122,15 @@ Tcl_UtfNcasecmp( */ cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); @@ -1112,7 +1145,7 @@ Tcl_UtfNcasecmp( /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- + * TclUtfCasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1132,11 +1165,20 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. */ { - while (*cs && *ct) { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); -- cgit v0.12