diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-10 21:04:41 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2019-03-10 21:04:41 (GMT) |
commit | 919d62dcd3d557c7976c331687ef9e8bbf6560c9 (patch) | |
tree | e68684c2b7317009033535896b177fcdfefb8dce /generic | |
parent | e6a53eb44dab26c44e01f4620467c2c5ae0f27e5 (diff) | |
parent | 934e6a98376ded432d70c77b3778869bc49763d4 (diff) | |
download | tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.zip tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.gz tcl-919d62dcd3d557c7976c331687ef9e8bbf6560c9.tar.bz2 |
Merge 8.7
Diffstat (limited to 'generic')
-rw-r--r-- | generic/regcomp.c | 2 | ||||
-rw-r--r-- | generic/tclInt.h | 11 | ||||
-rw-r--r-- | generic/tclStubInit.c | 96 | ||||
-rw-r--r-- | generic/tclUtf.c | 204 |
4 files changed, 216 insertions, 97 deletions
diff --git a/generic/regcomp.c b/generic/regcomp.c index 0855685..47f06c8 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -206,7 +206,7 @@ struct vars { int cflags; /* copy of compile flags */ int lasttype; /* type of previous token */ int nexttype; /* type of next token */ - chr nextvalue; /* value (if any) of next token */ + int nextvalue; /* value (if any) of next token */ int lexcon; /* lexical context type (see lex.c) */ int nsubexp; /* subexpression count */ struct subre **subs; /* subRE pointer vector */ diff --git a/generic/tclInt.h b/generic/tclInt.h index d999603..bb20792 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3170,6 +3170,17 @@ MODULE_SCOPE const char*TclGetCommandTypeName(Tcl_Command command); MODULE_SCOPE void TclRegisterCommandTypeName( Tcl_ObjCmdProc *implementationProc, const char *nameStr); +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +MODULE_SCOPE int TclUtfToWChar(const char *src, WCHAR *chPtr); +MODULE_SCOPE char * TclWCharToUtfDString(const WCHAR *uniStr, + int uniLength, Tcl_DString *dsPtr); +MODULE_SCOPE WCHAR * TclUtfToWCharDString(const char *src, + int length, Tcl_DString *dsPtr); +#else +# define TclUtfToWChar TclUtfToUniChar +# define TclWCharToUtfDString Tcl_UniCharToUtfDString +# define TclUtfToWCharDString Tcl_UtfToUniCharDString +#endif MODULE_SCOPE int TclUtfCmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE size_t TclUtfCount(int ch); diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 37d0e80..e49fc02 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -118,68 +118,11 @@ Tcl_WinUtfToTChar( size_t len, Tcl_DString *dsPtr) { -#if TCL_UTF_MAX > 4 - Tcl_UniChar ch = 0; - wchar_t *w, *wString; - const char *p, *end; - int oldLength; -#endif - Tcl_DStringInit(dsPtr); if (!string) { return NULL; } -#if TCL_UTF_MAX > 4 - - if (len < 0) { - len = strlen(string); - } - - /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in - * bytes. - */ - - oldLength = Tcl_DStringLength(dsPtr); - - Tcl_DStringSetLength(dsPtr, - oldLength + (int) ((len + 1) * sizeof(wchar_t))); - wString = (wchar_t *) (Tcl_DStringValue(dsPtr) + oldLength); - - w = wString; - p = string; - end = string + len - 4; - while (p < end) { - p += TclUtfToUniChar(p, &ch); - if (ch > 0xFFFF) { - *w++ = (wchar_t) (0xD800 + ((ch -= 0x10000) >> 10)); - *w++ = (wchar_t) (0xDC00 | (ch & 0x3FF)); - } else { - *w++ = ch; - } - } - end += 4; - while (p < end) { - if (Tcl_UtfCharComplete(p, end-p)) { - p += TclUtfToUniChar(p, &ch); - } else { - ch = UCHAR(*p++); - } - if (ch > 0xFFFF) { - *w++ = (wchar_t) (0xD800 + ((ch -= 0x10000) >> 10)); - *w++ = (wchar_t) (0xDC00 | (ch & 0x3FF)); - } else { - *w++ = ch; - } - } - *w = '\0'; - Tcl_DStringSetLength(dsPtr, - oldLength + ((char *) w - (char *) wString)); - - return (char *)wString; -#else - return (char *)Tcl_UtfToUniCharDString(string, len, dsPtr); -#endif + return (char *)Tcl_UtfToWCharDString(string, len, dsPtr); } char * @@ -188,12 +131,6 @@ Tcl_WinTCharToUtf( size_t len, Tcl_DString *dsPtr) { -#if TCL_UTF_MAX > 4 - const wchar_t *w, *wEnd; - char *p, *result; - int oldLength, blen = 1; -#endif - Tcl_DStringInit(dsPtr); if (!string) { return NULL; @@ -203,36 +140,7 @@ Tcl_WinTCharToUtf( } else { len /= 2; } -#if TCL_UTF_MAX > 4 - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4); - result = Tcl_DStringValue(dsPtr) + oldLength; - - p = result; - wEnd = (wchar_t *)string + len; - for (w = (wchar_t *)string; w < wEnd; ) { - if (!blen && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - blen = Tcl_UniCharToUtf(*w, p); - p += blen; - if ((*w >= 0xD800) && (blen < 3)) { - /* Indication that high surrogate is handled */ - blen = 0; - } - w++; - } - if (!blen) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); - - return result; -#else - return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr); -#endif + return TclWCharToUtfDString((Tcl_UniChar *)string, len, dsPtr); } #if defined(TCL_WIDE_INT_IS_LONG) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8c0a5b7..59116c4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -267,6 +267,50 @@ Tcl_UniCharToUtfDString( return string; } +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +char * +TclWCharToUtfDString( + const WCHAR *uniStr, /* WCHAR string to convert to UTF-8. */ + int uniLength, /* Length of WCHAR string in Tcl_UniChars + * (must be >= 0). */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + const WCHAR *w, *wEnd; + char *p, *string; + int oldLength, len = 1; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * 4. + */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + if (!len && ((*w & 0xFC00) != 0xDC00)) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + len = Tcl_UniCharToUtf(*w, p); + p += len; + if ((*w >= 0xD800) && (len < 3)) { + len = 0; /* Indication that high surrogate was found */ + } + w++; + } + if (!len) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} +#endif /* *--------------------------------------------------------------------------- * @@ -418,7 +462,109 @@ Tcl_UtfToUniChar( *chPtr = byte; return 1; } - + +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +int +TclUtfToWChar( + const char *src, /* The UTF-8 string. */ + WCHAR *chPtr)/* Filled with the WCHAR represented by + * the UTF-8 string. */ +{ + WCHAR byte; + + /* + * Unroll 1 to 4 byte UTF-8 sequences. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid + * characters representing themselves. + */ + + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. + */ + if ((byte >= 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) + && ((src[2] & 0xC0) == 0x80)) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } + if ((unsigned)(byte-0x80) < (unsigned)0x20) { + *chPtr = cp1252[byte-0x80]; + } else { + *chPtr = byte; + } + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { + return 2; + } + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + if (*chPtr > 0x7FF) { + return 3; + } + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + } + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if (high >= 0x400) { + /* out of range, < 0x10000 or > 0x10ffff */ + } else { + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + high; + return 1; + } + } + + /* + * A four-byte-character lead-byte not followed by three trail-bytes + * represents itself. + */ + } + + *chPtr = byte; + return 1; +} +#endif + /* *--------------------------------------------------------------------------- * @@ -489,7 +635,61 @@ Tcl_UtfToUniCharDString( return wString; } - + +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +WCHAR * +TclUtfToWCharDString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + WCHAR ch = 0, *w, *wString; + const char *p, *end; + int oldLength; + + if (length < 0) { + length = strlen(src); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. + */ + + oldLength = Tcl_DStringLength(dsPtr); + + Tcl_DStringSetLength(dsPtr, + oldLength + (int) ((length + 1) * sizeof(WCHAR))); + wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + p = src; + end = src + length - 4; + while (p < end) { + p += TclUtfToWChar(p, &ch); + *w++ = ch; + } + end += 4; + while (p < end) { + if (Tcl_UtfCharComplete(p, end-p)) { + p += TclUtfToWChar(p, &ch); + } else if (((UCHAR(*p)-0x80)) < 0x20) { + ch = cp1252[UCHAR(*p++)-0x80]; + } else { + ch = UCHAR(*p++); + } + *w++ = ch; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + oldLength + ((char *) w - (char *) wString)); + + return wString; +} +#endif /* *--------------------------------------------------------------------------- * |