summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c155
1 files changed, 71 insertions, 84 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6c39d1c..320d7aa 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -221,22 +221,33 @@ three:
*---------------------------------------------------------------------------
*/
+#undef Tcl_UniCharToUtfDString
char *
Tcl_UniCharToUtfDString(
- const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
- int uniLength, /* Length of Unicode string in Tcl_UniChars
- * (must be >= 0). */
+ const int *uniStr, /* Unicode string to convert to UTF-8. */
+ int uniLength, /* Length of Unicode string. */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- const Tcl_UniChar *w, *wEnd;
+ const int *w, *wEnd;
char *p, *string;
- int oldLength, len = 1;
+ int oldLength;
/*
* UTF-8 string length in bytes will be <= Unicode string length * 4.
*/
+ if (uniStr == NULL) {
+ return NULL;
+ }
+ if (uniLength < 0) {
+ uniLength = 0;
+ w = uniStr;
+ while (*w != '\0') {
+ uniLength++;
+ w++;
+ }
+ }
oldLength = Tcl_DStringLength(dsPtr);
Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
string = Tcl_DStringValue(dsPtr) + oldLength;
@@ -244,45 +255,43 @@ Tcl_UniCharToUtfDString(
p = string;
wEnd = uniStr + uniLength;
for (w = uniStr; w < wEnd; ) {
- if (!len && ((*w & 0xFC00) != 0xDC00)) {
- /* Special case for handling high surrogates. */
- p += Tcl_UniCharToUtf(-1, p);
- }
- len = Tcl_UniCharToUtf(*w, p);
- p += len;
- if ((*w >= 0xD800) && (len < 3)) {
- len = 0; /* Indication that high surrogate was found */
- }
+ p += Tcl_UniCharToUtf(*w, p);
w++;
}
- if (!len) {
- /* Special case for handling high surrogates. */
- p += Tcl_UniCharToUtf(-1, p);
- }
Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
return string;
}
-#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
char *
-TclWCharToUtfDString(
- const WCHAR *uniStr, /* WCHAR string to convert to UTF-8. */
- int uniLength, /* Length of WCHAR string in Tcl_UniChars
- * (must be >= 0). */
+Tcl_Char16ToUtfDString(
+ const unsigned short *uniStr,/* Utf-16 string to convert to UTF-8. */
+ int uniLength, /* Length of Utf-16 string. */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- const WCHAR *w, *wEnd;
+ const unsigned short *w, *wEnd;
char *p, *string;
int oldLength, len = 1;
/*
- * UTF-8 string length in bytes will be <= Unicode string length * 4.
+ * UTF-8 string length in bytes will be <= Utf16 string length * 3.
*/
+ if (uniStr == NULL) {
+ return NULL;
+ }
+ if (uniLength < 0) {
+
+ uniLength = 0;
+ w = uniStr;
+ while (*w != '\0') {
+ uniLength++;
+ w++;
+ }
+ }
oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
+ Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 3);
string = Tcl_DStringValue(dsPtr) + oldLength;
p = string;
@@ -307,7 +316,6 @@ TclWCharToUtfDString(
return string;
}
-#endif
/*
*---------------------------------------------------------------------------
*
@@ -350,13 +358,14 @@ static const unsigned short cp1252[32] = {
0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
};
+#undef Tcl_UtfToUniChar
int
Tcl_UtfToUniChar(
const char *src, /* The UTF-8 string. */
- Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+ int *chPtr)/* Filled with the unsigned int represented by
* the UTF-8 string. */
{
- Tcl_UniChar byte;
+ int byte;
/*
* Unroll 1 to 4 byte UTF-8 sequences.
@@ -372,20 +381,6 @@ Tcl_UtfToUniChar(
* characters representing themselves.
*/
-#if TCL_UTF_MAX <= 4
- /* If *chPtr contains a high surrogate (produced by a previous
- * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
- * bytes, then we must produce a follow-up low surrogate. We only
- * do that if the high surrogate matches the bits we encounter.
- */
- if ((byte >= 0x80)
- && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
- && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
- && ((src[2] & 0xC0) == 0x80)) {
- *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
- return 3;
- }
-#endif
if ((unsigned)(byte-0x80) < (unsigned)0x20) {
*chPtr = cp1252[byte-0x80];
} else {
@@ -431,23 +426,11 @@ Tcl_UtfToUniChar(
/*
* Four-byte-character lead byte followed by three trail bytes.
*/
-#if TCL_UTF_MAX <= 4
- Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
- | ((src[2] & 0x3F) >> 4)) - 0x40;
- if (high >= 0x400) {
- /* out of range, < 0x10000 or > 0x10ffff */
- } else {
- /* produce high surrogate, advance source pointer */
- *chPtr = 0xD800 + high;
- return 1;
- }
-#else
*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
return 4;
}
-#endif
}
/*
@@ -460,14 +443,13 @@ Tcl_UtfToUniChar(
return 1;
}
-#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
int
-TclUtfToWChar(
+Tcl_UtfToChar16(
const char *src, /* The UTF-8 string. */
- WCHAR *chPtr)/* Filled with the WCHAR represented by
+ unsigned short *chPtr)/* Filled with the unsigned short represented by
* the UTF-8 string. */
{
- WCHAR byte;
+ unsigned short byte;
/*
* Unroll 1 to 4 byte UTF-8 sequences.
@@ -540,7 +522,7 @@ TclUtfToWChar(
/*
* Four-byte-character lead byte followed by three trail bytes.
*/
- WCHAR high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+ unsigned short high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
| ((src[2] & 0x3F) >> 4)) - 0x40;
if (high >= 0x400) {
/* out of range, < 0x10000 or > 0x10ffff */
@@ -560,7 +542,6 @@ TclUtfToWChar(
*chPtr = byte;
return 1;
}
-#endif
/*
*---------------------------------------------------------------------------
@@ -580,7 +561,8 @@ TclUtfToWChar(
*---------------------------------------------------------------------------
*/
-Tcl_UniChar *
+#undef Tcl_UtfToUniCharDString
+int *
Tcl_UtfToUniCharDString(
const char *src, /* UTF-8 string to convert to Unicode. */
int length, /* Length of UTF-8 string in bytes, or -1 for
@@ -589,10 +571,13 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar ch = 0, *w, *wString;
+ int ch = 0, *w, *wString;
const char *p, *end;
int oldLength;
+ if (src == NULL) {
+ return NULL;
+ }
if (length < 0) {
length = strlen(src);
}
@@ -605,20 +590,20 @@ Tcl_UtfToUniCharDString(
oldLength = Tcl_DStringLength(dsPtr);
Tcl_DStringSetLength(dsPtr,
- oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
- wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
+ oldLength + ((length + 1) * sizeof(int)));
+ wString = (int *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
p = src;
end = src + length - 4;
while (p < end) {
- p += TclUtfToUniChar(p, &ch);
+ p += Tcl_UtfToUniChar(p, &ch);
*w++ = ch;
}
end += 4;
while (p < end) {
if (Tcl_UtfCharComplete(p, end-p)) {
- p += TclUtfToUniChar(p, &ch);
+ p += Tcl_UtfToUniChar(p, &ch);
} else {
ch = UCHAR(*p++);
}
@@ -631,9 +616,8 @@ Tcl_UtfToUniCharDString(
return wString;
}
-#if (TCL_UTF_MAX > 4) && (defined(__CYGWIN__) || defined(_WIN32))
-WCHAR *
-TclUtfToWCharDString(
+unsigned short *
+Tcl_UtfToChar16DString(
const char *src, /* UTF-8 string to convert to Unicode. */
int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
@@ -641,10 +625,14 @@ TclUtfToWCharDString(
* appended to this previously initialized
* DString. */
{
- WCHAR ch = 0, *w, *wString;
+ unsigned short ch = 0;
+ unsigned short *w, *wString;
const char *p, *end;
int oldLength;
+ if (src == NULL) {
+ return NULL;
+ }
if (length < 0) {
length = strlen(src);
}
@@ -657,20 +645,20 @@ TclUtfToWCharDString(
oldLength = Tcl_DStringLength(dsPtr);
Tcl_DStringSetLength(dsPtr,
- oldLength + (int) ((length + 1) * sizeof(WCHAR)));
- wString = (WCHAR *) (Tcl_DStringValue(dsPtr) + oldLength);
+ oldLength + ((length + 1) * sizeof(unsigned short)));
+ wString = (unsigned short *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
p = src;
end = src + length - 4;
while (p < end) {
- p += TclUtfToWChar(p, &ch);
+ p += Tcl_UtfToChar16(p, &ch);
*w++ = ch;
}
end += 4;
while (p < end) {
if (Tcl_UtfCharComplete(p, end-p)) {
- p += TclUtfToWChar(p, &ch);
+ p += Tcl_UtfToChar16(p, &ch);
} else {
ch = UCHAR(*p++);
}
@@ -682,7 +670,6 @@ TclUtfToWCharDString(
return wString;
}
-#endif
/*
*---------------------------------------------------------------------------
*
@@ -2151,7 +2138,7 @@ Tcl_UniCharCaseMatch(
if ((p != '[') && (p != '?') && (p != '\\')) {
if (nocase) {
while (*uniStr && (p != *uniStr)
- && (p != (Tcl_UniChar)Tcl_UniCharToLower(*uniStr))) {
+ && (p != Tcl_UniCharToLower(*uniStr))) {
uniStr++;
}
} else {
@@ -2191,13 +2178,13 @@ Tcl_UniCharCaseMatch(
Tcl_UniChar startChar, endChar;
uniPattern++;
- ch1 = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*uniStr) : *uniStr);
+ ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
uniStr++;
while (1) {
if ((*uniPattern == ']') || (*uniPattern == 0)) {
return 0;
}
- startChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*uniPattern)
+ startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
: *uniPattern);
uniPattern++;
if (*uniPattern == '-') {
@@ -2205,7 +2192,7 @@ Tcl_UniCharCaseMatch(
if (*uniPattern == 0) {
return 0;
}
- endChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*uniPattern)
+ endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
: *uniPattern);
uniPattern++;
if (((startChar <= ch1) && (ch1 <= endChar))
@@ -2343,7 +2330,7 @@ TclUniCharMatch(
if ((p != '[') && (p != '?') && (p != '\\')) {
if (nocase) {
while ((string < stringEnd) && (p != *string)
- && (p != (Tcl_UniChar)Tcl_UniCharToLower(*string))) {
+ && (p != Tcl_UniCharToLower(*string))) {
string++;
}
} else {
@@ -2384,20 +2371,20 @@ TclUniCharMatch(
Tcl_UniChar ch1, startChar, endChar;
pattern++;
- ch1 = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*string) : *string);
+ ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
string++;
while (1) {
if ((*pattern == ']') || (pattern == patternEnd)) {
return 0;
}
- startChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*pattern) : *pattern);
+ startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
pattern++;
if (*pattern == '-') {
pattern++;
if (pattern == patternEnd) {
return 0;
}
- endChar = (nocase ? (Tcl_UniChar)Tcl_UniCharToLower(*pattern)
+ endChar = (nocase ? Tcl_UniCharToLower(*pattern)
: *pattern);
pattern++;
if (((startChar <= ch1) && (ch1 <= endChar))