From 934e6a98376ded432d70c77b3778869bc49763d4 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 10 Mar 2019 20:18:48 +0000 Subject: re-implement changes in win/tclWinFile.c (handling -DTCL_UTF_MAX=6) using 3 new utility functions. This allows re-using code in more places: a cleaner, more future-proof implementation. --- generic/regcomp.c | 2 +- generic/tclInt.h | 11 +++ generic/tclStubInit.c | 96 +----------------------- generic/tclUtf.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++++- win/tclWin32Dll.c | 100 +------------------------ win/tclWinFCmd.c | 13 ++-- win/tclWinFile.c | 20 ++--- win/tclWinInit.c | 9 ++- win/tclWinPipe.c | 6 +- 9 files changed, 245 insertions(+), 216 deletions(-) diff --git a/generic/regcomp.c b/generic/regcomp.c index 7735d8b..49b024f 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -206,7 +206,7 @@ struct vars { int cflags; /* copy of compile flags */ int lasttype; /* type of previous token */ int nexttype; /* type of next token */ - chr nextvalue; /* value (if any) of next token */ + int nextvalue; /* value (if any) of next token */ int lexcon; /* lexical context type (see lex.c) */ int nsubexp; /* subexpression count */ struct subre **subs; /* subRE pointer vector */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 3a77196..beb7a35 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3241,6 +3241,17 @@ MODULE_SCOPE const char*TclGetCommandTypeName(Tcl_Command command); MODULE_SCOPE void TclRegisterCommandTypeName( Tcl_ObjCmdProc *implementationProc, const char *nameStr); +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +MODULE_SCOPE int TclUtfToWChar(const char *src, WCHAR *chPtr); +MODULE_SCOPE char * TclWCharToUtfDString(const WCHAR *uniStr, + int uniLength, Tcl_DString *dsPtr); +MODULE_SCOPE WCHAR * TclUtfToWCharDString(const char *src, + int length, Tcl_DString *dsPtr); +#else +# define TclUtfToWChar TclUtfToUniChar +# define TclWCharToUtfDString Tcl_UniCharToUtfDString +# define 
TclUtfToWCharDString Tcl_UtfToUniCharDString +#endif MODULE_SCOPE int TclUtfCmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 43d1a50..cd31e10 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -240,68 +240,11 @@ Tcl_WinUtfToTChar( int len, Tcl_DString *dsPtr) { -#if TCL_UTF_MAX > 4 - Tcl_UniChar ch = 0; - wchar_t *w, *wString; - const char *p, *end; - int oldLength; -#endif - Tcl_DStringInit(dsPtr); if (!string) { return NULL; } -#if TCL_UTF_MAX > 4 - - if (len < 0) { - len = strlen(string); - } - - /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in - * bytes. - */ - - oldLength = Tcl_DStringLength(dsPtr); - - Tcl_DStringSetLength(dsPtr, - oldLength + (int) ((len + 1) * sizeof(wchar_t))); - wString = (wchar_t *) (Tcl_DStringValue(dsPtr) + oldLength); - - w = wString; - p = string; - end = string + len - 4; - while (p < end) { - p += TclUtfToUniChar(p, &ch); - if (ch > 0xFFFF) { - *w++ = (wchar_t) (0xD800 + ((ch -= 0x10000) >> 10)); - *w++ = (wchar_t) (0xDC00 | (ch & 0x3FF)); - } else { - *w++ = ch; - } - } - end += 4; - while (p < end) { - if (Tcl_UtfCharComplete(p, end-p)) { - p += TclUtfToUniChar(p, &ch); - } else { - ch = UCHAR(*p++); - } - if (ch > 0xFFFF) { - *w++ = (wchar_t) (0xD800 + ((ch -= 0x10000) >> 10)); - *w++ = (wchar_t) (0xDC00 | (ch & 0x3FF)); - } else { - *w++ = ch; - } - } - *w = '\0'; - Tcl_DStringSetLength(dsPtr, - oldLength + ((char *) w - (char *) wString)); - - return (char *)wString; -#else - return (char *)Tcl_UtfToUniCharDString(string, len, dsPtr); -#endif + return (char *)Tcl_UtfToWCharDString(string, len, dsPtr); } char * @@ -310,12 +253,6 @@ Tcl_WinTCharToUtf( int len, Tcl_DString *dsPtr) { -#if TCL_UTF_MAX > 4 - const wchar_t *w, *wEnd; - char *p, *result; - int oldLength, blen = 1; -#endif - Tcl_DStringInit(dsPtr); if (!string) { return 
NULL; @@ -325,36 +262,7 @@ Tcl_WinTCharToUtf( } else { len /= 2; } -#if TCL_UTF_MAX > 4 - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4); - result = Tcl_DStringValue(dsPtr) + oldLength; - - p = result; - wEnd = (wchar_t *)string + len; - for (w = (wchar_t *)string; w < wEnd; ) { - if (!blen && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - blen = Tcl_UniCharToUtf(*w, p); - p += blen; - if ((*w >= 0xD800) && (blen < 3)) { - /* Indication that high surrogate is handled */ - blen = 0; - } - w++; - } - if (!blen) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); - - return result; -#else - return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr); -#endif + return TclWCharToUtfDString((Tcl_UniChar *)string, len, dsPtr); } #if defined(TCL_WIDE_INT_IS_LONG) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4f2ec5a..b5d8824 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -266,6 +266,50 @@ Tcl_UniCharToUtfDString( return string; } +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +char * +TclWCharToUtfDString( + const WCHAR *uniStr, /* WCHAR string to convert to UTF-8. */ + int uniLength, /* Length of WCHAR string in Tcl_UniChars + * (must be >= 0). */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + const WCHAR *w, *wEnd; + char *p, *string; + int oldLength, len = 1; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * 4. 
+ */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + if (!len && ((*w & 0xFC00) != 0xDC00)) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + len = Tcl_UniCharToUtf(*w, p); + p += len; + if ((*w >= 0xD800) && (len < 3)) { + len = 0; /* Indication that high surrogate was found */ + } + w++; + } + if (!len) { + /* Special case for handling high surrogates. */ + p += Tcl_UniCharToUtf(-1, p); + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} +#endif /* *--------------------------------------------------------------------------- * @@ -417,7 +461,109 @@ Tcl_UtfToUniChar( *chPtr = byte; return 1; } - + +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +int +TclUtfToWChar( + const char *src, /* The UTF-8 string. */ + WCHAR *chPtr)/* Filled with the WCHAR represented by + * the UTF-8 string. */ +{ + WCHAR byte; + + /* + * Unroll 1 to 4 byte UTF-8 sequences. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid + * characters representing themselves. + */ + + /* If *chPtr contains a high surrogate (produced by a previous + * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation + * bytes, then we must produce a follow-up low surrogate. We only + * do that if the high surrogate matches the bits we encounter. 
+ */ + if ((byte >= 0x80) + && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) + && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) + && ((src[2] & 0xC0) == 0x80)) { + *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; + return 3; + } + if ((unsigned)(byte-0x80) < (unsigned)0x20) { + *chPtr = cp1252[byte-0x80]; + } else { + *chPtr = byte; + } + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { + return 2; + } + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + if (*chPtr > 0x7FF) { + return 3; + } + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + } + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + | ((src[2] & 0x3F) >> 4)) - 0x40; + if (high >= 0x400) { + /* out of range, < 0x10000 or > 0x10ffff */ + } else { + /* produce high surrogate, advance source pointer */ + *chPtr = 0xD800 + high; + return 1; + } + } + + /* + * A four-byte-character lead-byte not followed by three trail-bytes + * represents itself. 
+ */ + } + + *chPtr = byte; + return 1; +} +#endif + /* *--------------------------------------------------------------------------- * @@ -488,7 +634,61 @@ Tcl_UtfToUniCharDString( return wString; } - + +#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) +WCHAR * +TclUtfToWCharDString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + WCHAR ch = 0, *w, *wString; + const char *p, *end; + int oldLength; + + if (length < 0) { + length = strlen(src); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. + */ + + oldLength = Tcl_DStringLength(dsPtr); + + Tcl_DStringSetLength(dsPtr, + oldLength + (int) ((length + 1) * sizeof(WCHAR))); + wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + p = src; + end = src + length - 4; + while (p < end) { + p += TclUtfToWChar(p, &ch); + *w++ = ch; + } + end += 4; + while (p < end) { + if (Tcl_UtfCharComplete(p, end-p)) { + p += TclUtfToWChar(p, &ch); + } else if (((UCHAR(*p)-0x80)) < 0x20) { + ch = cp1252[UCHAR(*p++)-0x80]; + } else { + ch = UCHAR(*p++); + } + *w++ = ch; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + oldLength + ((char *) w - (char *) wString)); + + return wString; +} +#endif /* *--------------------------------------------------------------------------- * diff --git a/win/tclWin32Dll.c b/win/tclWin32Dll.c index c39d2c1..4c2134b 100644 --- a/win/tclWin32Dll.c +++ b/win/tclWin32Dll.c @@ -471,123 +471,31 @@ Tcl_WinUtfToTChar( Tcl_DString *dsPtr) /* Uninitialized or free DString in which the * converted string is stored. 
*/ { -#if TCL_UTF_MAX > 4 - Tcl_UniChar ch = 0; - TCHAR *w, *wString; - const char *p, *end; - int oldLength; -#endif - Tcl_DStringInit(dsPtr); if (!string) { return NULL; } -#if TCL_UTF_MAX > 4 - - if (len < 0) { - len = strlen(string); - } - - /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in - * bytes. - */ - - oldLength = Tcl_DStringLength(dsPtr); - - Tcl_DStringSetLength(dsPtr, - oldLength + (int) ((len + 1) * sizeof(TCHAR))); - wString = (TCHAR *) (Tcl_DStringValue(dsPtr) + oldLength); - - w = wString; - p = string; - end = string + len - 4; - while (p < end) { - p += TclUtfToUniChar(p, &ch); - if (ch > 0xFFFF) { - *w++ = (wchar_t) (0xD800 + ((ch -= 0x10000) >> 10)); - *w++ = (wchar_t) (0xDC00 | (ch & 0x3FF)); - } else { - *w++ = ch; - } - } - end += 4; - while (p < end) { - if (Tcl_UtfCharComplete(p, end-p)) { - p += TclUtfToUniChar(p, &ch); - } else { - ch = UCHAR(*p++); - } - if (ch > 0xFFFF) { - *w++ = (wchar_t) (0xD800 + ((ch -= 0x10000) >> 10)); - *w++ = (wchar_t) (0xDC00 | (ch & 0x3FF)); - } else { - *w++ = ch; - } - } - *w = '\0'; - Tcl_DStringSetLength(dsPtr, - oldLength + ((char *) w - (char *) wString)); - - return wString; -#else - return Tcl_UtfToUniCharDString(string, len, dsPtr); -#endif + return TclUtfToWCharDString(string, len, dsPtr); } char * Tcl_WinTCharToUtf( - const TCHAR *string, /* Source string in Unicode. */ + const WCHAR *string, /* Source string in Unicode. */ int len, /* Source string length in bytes, or -1 for * platform-specific string length. */ Tcl_DString *dsPtr) /* Uninitialized or free DString in which the * converted string is stored. 
*/ { -#if TCL_UTF_MAX > 4 - const TCHAR *w, *wEnd; - char *p, *result; - int oldLength, blen = 1; -#endif - Tcl_DStringInit(dsPtr); if (!string) { return NULL; } if (len < 0) { - len = wcslen((TCHAR *)string); + len = wcslen((WCHAR *)string); } else { len /= 2; } -#if TCL_UTF_MAX > 4 - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4); - result = Tcl_DStringValue(dsPtr) + oldLength; - - p = result; - wEnd = (TCHAR *)string + len; - for (w = (TCHAR *)string; w < wEnd; ) { - if (!blen && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - blen = Tcl_UniCharToUtf(*w, p); - p += blen; - if ((*w >= 0xD800) && (blen < 3)) { - /* Indication that high surrogate is handled */ - blen = 0; - } - w++; - } - if (!blen) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); - - return result; -#else - return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr); -#endif + return TclWCharToUtfDString((unsigned short *)string, len, dsPtr); } /* diff --git a/win/tclWinFCmd.c b/win/tclWinFCmd.c index c3ced34..6580627 100644 --- a/win/tclWinFCmd.c +++ b/win/tclWinFCmd.c @@ -1524,8 +1524,8 @@ GetWinFileAttributes( * We test for, and fix that case, here. 
*/ - const char *str = TclGetString(fileName); - size_t len = fileName->length; + int len; + const char *str = TclGetStringFromObj(fileName, &len); if (len < 4) { if (len == 0) { @@ -1610,11 +1610,12 @@ ConvertFileNameFormat( for (i = 0; i < pathc; i++) { Tcl_Obj *elt; char *pathv; + int length; Tcl_ListObjIndex(NULL, splitPath, i, &elt); - pathv = TclGetString(elt); - if ((pathv[0] == '/') || ((elt->length == 3) && (pathv[1] == ':')) + pathv = TclGetStringFromObj(elt, &length); + if ((pathv[0] == '/') || ((length == 3) && (pathv[1] == ':')) || (strcmp(pathv, ".") == 0) || (strcmp(pathv, "..") == 0)) { /* * Handle "/", "//machine/export", "c:/", "." or ".." by just @@ -1649,8 +1650,8 @@ ConvertFileNameFormat( * likely to lead to infinite loops. */ - tempString = TclGetString(tempPath); - nativeName = Tcl_WinUtfToTChar(tempString, tempPath->length, &ds); + tempString = TclGetStringFromObj(tempPath, &length); + nativeName = Tcl_WinUtfToTChar(tempString, length, &ds); Tcl_DecrRefCount(tempPath); handle = FindFirstFile(nativeName, &data); if (handle == INVALID_HANDLE_VALUE) { diff --git a/win/tclWinFile.c b/win/tclWinFile.c index bae4bd7..899311a 100644 --- a/win/tclWinFile.c +++ b/win/tclWinFile.c @@ -1464,13 +1464,15 @@ TclpGetUserHome( } Tcl_DStringFree(&ds); } else { - wName = Tcl_WinUtfToTChar(domain + 1, -1, &ds); + Tcl_DStringInit(&ds); + wName = TclUtfToWCharDString(domain + 1, -1, &ds); rc = NetGetDCName(NULL, wName, (LPBYTE *) &wDomain); Tcl_DStringFree(&ds); nameLen = domain - name; } if (rc == 0) { - wName = Tcl_WinUtfToTChar(name, nameLen, &ds); + Tcl_DStringInit(&ds); + wName = TclUtfToWCharDString(name, nameLen, &ds); while (NetUserGetInfo(wDomain, wName, 1, (LPBYTE *) &uiPtr) != 0) { /* * user does not exists - if domain was not specified, @@ -1488,14 +1490,14 @@ TclpGetUserHome( wHomeDir = uiPtr->usri1_home_dir; if ((wHomeDir != NULL) && (wHomeDir[0] != L'\0')) { size = lstrlenW(wHomeDir); - Tcl_WinTCharToUtf((TCHAR *) wHomeDir, size * sizeof(WCHAR), 
bufferPtr); + TclWCharToUtfDString(wHomeDir, size, bufferPtr); } else { /* * User exists but has no home dir. Return * "{GetProfilesDirectory}/". */ GetProfilesDirectoryW(buf, &size); - Tcl_WinTCharToUtf(buf, (size-1) * sizeof(WCHAR), bufferPtr); + TclWCharToUtfDString(buf, size-1, bufferPtr); Tcl_DStringAppend(bufferPtr, "/", 1); Tcl_DStringAppend(bufferPtr, name, nameLen); } @@ -2842,8 +2844,7 @@ TclWinVolumeRelativeNormalize( */ int cwdLen; - const char *drive = - TclGetStringFromObj(useThisCwd, &cwdLen); + const char *drive = TclGetStringFromObj(useThisCwd, &cwdLen); char drive_cur = path[0]; if (drive_cur >= 'a') { @@ -2978,7 +2979,7 @@ TclNativeCreateNativeRep( WCHAR *nativePathPtr = NULL; const char *str; Tcl_Obj *validPathPtr; - size_t len; + int len; WCHAR *wp; if (TclFSCwdIsNative()) { @@ -3006,10 +3007,9 @@ TclNativeCreateNativeRep( Tcl_IncrRefCount(validPathPtr); } - str = Tcl_GetString(validPathPtr); - len = validPathPtr->length; + str = Tcl_GetStringFromObj(validPathPtr, &len); - if (strlen(str)!=(unsigned int)len) { + if (strlen(str)!=(size_t)len) { /* String contains NUL-bytes. This is invalid. 
*/ goto done; } diff --git a/win/tclWinInit.c b/win/tclWinInit.c index d780660..97a47bf 100644 --- a/win/tclWinInit.c +++ b/win/tclWinInit.c @@ -188,6 +188,7 @@ TclpInitLibraryPath( Tcl_Obj *pathPtr; char installLib[LIBRARY_SIZE]; const char *bytes; + int length; pathPtr = Tcl_NewObj(); @@ -223,10 +224,10 @@ TclpInitLibraryPath( TclGetProcessGlobalValue(&sourceLibraryDir)); *encodingPtr = NULL; - bytes = TclGetString(pathPtr); - *lengthPtr = pathPtr->length; - *valuePtr = ckalloc(*lengthPtr + 1); - memcpy(*valuePtr, bytes, *lengthPtr + 1); + bytes = TclGetStringFromObj(pathPtr, &length); + *lengthPtr = length++; + *valuePtr = ckalloc(length); + memcpy(*valuePtr, bytes, length); Tcl_DecrRefCount(pathPtr); } diff --git a/win/tclWinPipe.c b/win/tclWinPipe.c index 86b98f7..7b4769c 100644 --- a/win/tclWinPipe.c +++ b/win/tclWinPipe.c @@ -3119,15 +3119,15 @@ TclpOpenTemporaryFile( } namePtr += length * sizeof(TCHAR); if (basenameObj) { - const char *string = Tcl_GetString(basenameObj); + const char *string = TclGetStringFromObj(basenameObj, &length); - Tcl_WinUtfToTChar(string, basenameObj->length, &buf); + Tcl_WinUtfToTChar(string, length, &buf); memcpy(namePtr, Tcl_DStringValue(&buf), Tcl_DStringLength(&buf)); namePtr += Tcl_DStringLength(&buf); Tcl_DStringFree(&buf); } else { const TCHAR *baseStr = TEXT("TCL"); - int length = 3 * sizeof(TCHAR); + length = 3 * sizeof(TCHAR); memcpy(namePtr, baseStr, length); namePtr += length; -- cgit v0.12