diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-02 16:53:42 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-02 16:53:42 (GMT) |
| commit | 8c315fd31ff823b217374dd32577e04c42674249 (patch) | |
| tree | 85b723f08e36160bc0b6f437d9cd6bced8d061dd | |
| parent | c0c278ccb909abc9b83305b8873e3171f5d9ab02 (diff) | |
| parent | d0eefe67c87f69a16ae393d0ab5eb0847292c340 (diff) | |
| download | tcl-8c315fd31ff823b217374dd32577e04c42674249.zip tcl-8c315fd31ff823b217374dd32577e04c42674249.tar.gz tcl-8c315fd31ff823b217374dd32577e04c42674249.tar.bz2 | |
Merge 8.7
| -rw-r--r-- | generic/tclScan.c | 4 | ||||
| -rw-r--r-- | generic/tclUtf.c | 19 |
2 files changed, 18 insertions, 5 deletions
```diff
diff --git a/generic/tclScan.c b/generic/tclScan.c
index 3529951..775e42a 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -881,8 +881,8 @@ Tcl_ScanObjCmd(
 
                 offset = TclUtfToUniChar(string, &sch);
                 i = (int)sch;
-#if TCL_UTF_MAX == 4
-                if (((sch & 0xFC00) == 0xD800) && (offset < 3)) {
+#if TCL_UTF_MAX <= 4
+                if ((sch >= 0xD800) && (offset < 3)) {
                     offset += TclUtfToUniChar(string+offset, &sch);
                     i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
                 }
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e9e4432..4d9edf1 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -112,6 +112,19 @@ TclUtfCount(
  * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
  * provided buffer. Equivalent to Plan 9 runetochar().
  *
+ * Special handling of Surrogate pairs is handled as follows:
+ * When this function is called for ch being a high surrogate,
+ * the first byte of the 4-byte UTF-8 sequence is produced and
+ * the function returns 1. Calling the function again with a
+ * low surrogate, the remaining 3 bytes of the 4-byte UTF-8
+ * sequence is produced, and the function returns 3. The buffer
+ * is used to remember the high surrogate between the two calls.
+ *
+ * If no low surrogate follows the high surrogate (which is actually
+ * illegal), this can be handled reasonably by calling Tcl_UniCharToUtf
+ * again with ch = -1. This will produce a 3-byte UTF-8 sequence
+ * representing the high surrogate.
+ *
  * Results:
  * The return values is the number of bytes in the buffer that were
  * consumed.
@@ -270,11 +283,11 @@ Tcl_UniCharToUtfDString(
  * Tcl_UtfCharComplete() before calling this routine to ensure that
  * enough bytes remain in the string.
  *
- * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done:
+ * Special handling of Surrogate pairs is handled as follows:
  * For any UTF-8 string containing a character outside of the BMP, the
  * first call to this function will fill *chPtr with the high surrogate
- * and generate a return value of 0. Calling Tcl_UtfToUniChar again
- * will produce the low surrogate and a return value of 4. Because *chPtr
+ * and generate a return value of 1. Calling Tcl_UtfToUniChar again
+ * will produce the low surrogate and a return value of 3. Because *chPtr
  * is used to remember whether the high surrogate is already produced, it
  * is recommended to initialize the variable it points to as 0 before
  * the first call to Tcl_UtfToUniChar is done.
```
