diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 155 |
1 files changed, 71 insertions, 84 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6c39d1c..320d7aa 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -221,22 +221,33 @@ three: *--------------------------------------------------------------------------- */ +#undef Tcl_UniCharToUtfDString char * Tcl_UniCharToUtfDString( - const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ - int uniLength, /* Length of Unicode string in Tcl_UniChars - * (must be >= 0). */ + const int *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string. */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - const Tcl_UniChar *w, *wEnd; + const int *w, *wEnd; char *p, *string; - int oldLength, len = 1; + int oldLength; /* * UTF-8 string length in bytes will be <= Unicode string length * 4. */ + if (uniStr == NULL) { + return NULL; + } + if (uniLength < 0) { + uniLength = 0; + w = uniStr; + while (*w != '\0') { + uniLength++; + w++; + } + } oldLength = Tcl_DStringLength(dsPtr); Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); string = Tcl_DStringValue(dsPtr) + oldLength; @@ -244,45 +255,43 @@ Tcl_UniCharToUtfDString( p = string; wEnd = uniStr + uniLength; for (w = uniStr; w < wEnd; ) { - if (!len && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - len = Tcl_UniCharToUtf(*w, p); - p += len; - if ((*w >= 0xD800) && (len < 3)) { - len = 0; /* Indication that high surrogate was found */ - } + p += Tcl_UniCharToUtf(*w, p); w++; } - if (!len) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); return string; } -#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) char * -TclWCharToUtfDString( - const WCHAR *uniStr, /* WCHAR string to convert to UTF-8. */ - int uniLength, /* Length of WCHAR string in Tcl_UniChars - * (must be >= 0). */ +Tcl_Char16ToUtfDString( + const unsigned short *uniStr,/* Utf-16 string to convert to UTF-8. */ + int uniLength, /* Length of Utf-16 string. */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - const WCHAR *w, *wEnd; + const unsigned short *w, *wEnd; char *p, *string; int oldLength, len = 1; /* - * UTF-8 string length in bytes will be <= Unicode string length * 4. + * UTF-8 string length in bytes will be <= Utf16 string length * 3. */ + if (uniStr == NULL) { + return NULL; + } + if (uniLength < 0) { + + uniLength = 0; + w = uniStr; + while (*w != '\0') { + uniLength++; + w++; + } + } oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 3); string = Tcl_DStringValue(dsPtr) + oldLength; p = string; @@ -307,7 +316,6 @@ TclWCharToUtfDString( return string; } -#endif /* *--------------------------------------------------------------------------- * @@ -350,13 +358,14 @@ static const unsigned short cp1252[32] = { 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 }; +#undef Tcl_UtfToUniChar int Tcl_UtfToUniChar( const char *src, /* The UTF-8 string. */ - Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + int *chPtr)/* Filled with the unsigned int represented by * the UTF-8 string. */ { - Tcl_UniChar byte; + int byte; /* * Unroll 1 to 4 byte UTF-8 sequences. @@ -372,20 +381,6 @@ Tcl_UtfToUniChar( * characters representing themselves. */ -#if TCL_UTF_MAX <= 4 - /* If *chPtr contains a high surrogate (produced by a previous - * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation - * bytes, then we must produce a follow-up low surrogate. We only - * do that if the high surrogate matches the bits we encounter. - */ - if ((byte >= 0x80) - && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) - && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) - && ((src[2] & 0xC0) == 0x80)) { - *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; - return 3; - } -#endif if ((unsigned)(byte-0x80) < (unsigned)0x20) { *chPtr = cp1252[byte-0x80]; } else { @@ -431,23 +426,11 @@ Tcl_UtfToUniChar( /* * Four-byte-character lead byte followed by three trail bytes. */ -#if TCL_UTF_MAX <= 4 - Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) - | ((src[2] & 0x3F) >> 4)) - 0x40; - if (high >= 0x400) { - /* out of range, < 0x10000 or > 0x10ffff */ - } else { - /* produce high surrogate, advance source pointer */ - *chPtr = 0xD800 + high; - return 1; - } -#else *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { return 4; } -#endif } /* @@ -460,14 +443,13 @@ Tcl_UtfToUniChar( return 1; } -#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) int -TclUtfToWChar( +Tcl_UtfToChar16( const char *src, /* The UTF-8 string. */ - WCHAR *chPtr)/* Filled with the WCHAR represented by + unsigned short *chPtr)/* Filled with the unsigned short represented by * the UTF-8 string. */ { - WCHAR byte; + unsigned short byte; /* * Unroll 1 to 4 byte UTF-8 sequences. @@ -540,7 +522,7 @@ TclUtfToWChar( /* * Four-byte-character lead byte followed by three trail bytes. */ - WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) + unsigned short high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; if (high >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ @@ -560,7 +542,6 @@ TclUtfToWChar( *chPtr = byte; return 1; } -#endif /* *--------------------------------------------------------------------------- @@ -580,7 +561,8 @@ TclUtfToWChar( *--------------------------------------------------------------------------- */ -Tcl_UniChar * +#undef Tcl_UtfToUniCharDString +int * Tcl_UtfToUniCharDString( const char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for @@ -589,10 +571,13 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar ch = 0, *w, *wString; + int ch = 0, *w, *wString; const char *p, *end; int oldLength; + if (src == NULL) { + return NULL; + } if (length < 0) { length = strlen(src); } @@ -605,20 +590,20 @@ Tcl_UtfToUniCharDString( oldLength = Tcl_DStringLength(dsPtr); Tcl_DStringSetLength(dsPtr, - oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar))); - wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); + oldLength + ((length + 1) * sizeof(int))); + wString = (int *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; p = src; end = src + length - 4; while (p < end) { - p += TclUtfToUniChar(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); *w++ = ch; } end += 4; while (p < end) { if (Tcl_UtfCharComplete(p, end-p)) { - p += TclUtfToUniChar(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); } else { ch = UCHAR(*p++); } @@ -631,9 +616,8 @@ Tcl_UtfToUniCharDString( return wString; } -#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32)) -WCHAR * -TclUtfToWCharDString( +unsigned short * +Tcl_UtfToChar16DString( const char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ @@ -641,10 +625,14 @@ TclUtfToWCharDString( * appended to this previously initialized * DString. */ { - WCHAR ch = 0, *w, *wString; + unsigned short ch = 0; + unsigned short *w, *wString; const char *p, *end; int oldLength; + if (src == NULL) { + return NULL; + } if (length < 0) { length = strlen(src); } @@ -657,20 +645,20 @@ TclUtfToWCharDString( oldLength = Tcl_DStringLength(dsPtr); Tcl_DStringSetLength(dsPtr, - oldLength + (int) ((length + 1) * sizeof(WCHAR))); - wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength); + oldLength + ((length + 1) * sizeof(unsigned short))); + wString = (unsigned short *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; p = src; end = src + length - 4; while (p < end) { - p += TclUtfToWChar(p, &ch); + p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } end += 4; while (p < end) { if (Tcl_UtfCharComplete(p, end-p)) { - p += TclUtfToWChar(p, &ch); + p += Tcl_UtfToChar16(p, &ch); } else { ch = UCHAR(*p++); } @@ -682,7 +670,6 @@ TclUtfToWCharDString( return wString; } -#endif /* *--------------------------------------------------------------------------- * @@ -2151,7 +2138,7 @@ Tcl_UniCharCaseMatch( if ((p != '[') && (p != '?') && (p != '\\')) { if (nocase) { while (*uniStr && (p != *uniStr) - && (p != (Tcl_UniChar)Tcl_UniCharToLower(*uniStr))) { + && (p != Tcl_UniCharToLower(*uniStr))) { uniStr++; } } else { @@ -2191,13 +2178,13 @@ Tcl_UniCharCaseMatch( Tcl_UniChar startChar, endChar; uniPattern++; - ch1 = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*uniStr) : *uniStr); + ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); uniStr++; while (1) { if ((*uniPattern == ']') || (*uniPattern == 0)) { return 0; } - startChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*uniPattern) + startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) : *uniPattern); uniPattern++; if (*uniPattern == '-') { @@ -2205,7 +2192,7 @@ Tcl_UniCharCaseMatch( if (*uniPattern == 0) { return 0; } - endChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*uniPattern) + endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) : *uniPattern); uniPattern++; if (((startChar <= ch1) && (ch1 <= endChar)) @@ -2343,7 +2330,7 @@ TclUniCharMatch( if ((p != '[') && (p != '?') && (p != '\\')) { if (nocase) { while ((string < stringEnd) && (p != *string) - && (p != (Tcl_UniChar)Tcl_UniCharToLower(*string))) { + && (p != Tcl_UniCharToLower(*string))) { string++; } } else { @@ -2384,20 +2371,20 @@ TclUniCharMatch( Tcl_UniChar ch1, startChar, endChar; pattern++; - ch1 = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*string) : *string); + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); string++; while (1) { if ((*pattern == ']') || (pattern == patternEnd)) { return 0; } - startChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*pattern) : *pattern); + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); pattern++; if (*pattern == '-') { pattern++; if (pattern == patternEnd) { return 0; } - endChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*pattern) + endChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); pattern++; if (((startChar <= ch1) && (ch1 <= endChar)) |
