summary refs log tree commit diff stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 1748
1 files changed, 346 insertions, 1402 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 2fa0e80..e5497a4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -3,7 +3,7 @@
*
* Routines for manipulating UTF-8 strings.
*
- * Copyright © 1997-1998 Sun Microsystems, Inc.
+ * Copyright (c) 1997-1998 Sun Microsystems, Inc.
*
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
@@ -26,7 +26,7 @@
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
-#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT))
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
@@ -55,44 +55,46 @@
#define UNICODE_SELF 0x80
/*
- * The following structures are used when mapping between Unicode and
+ * The following structures are used when mapping between Unicode (UCS-2) and
* UTF-8.
*/
-static const unsigned char totalBytes[256] = {
+static CONST unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#if TCL_UTF_MAX > 3
+ 4,4,4,4,4,4,4,4,
+#else
+ 1,1,1,1,1,1,1,1,
+#endif
+#if TCL_UTF_MAX > 4
+ 5,5,5,5,
+#else
+ 1,1,1,1,
+#endif
+#if TCL_UTF_MAX > 5
+ 6,6,6,6
+#else
+ 1,1,1,1
+#endif
};
-static const unsigned char complete[256] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-/* End of "continuation byte section" */
- 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
-};
-
/*
* Functions used only in this module.
*/
-static int Invalid(const char *src);
+static int UtfCount(int ch);
/*
*---------------------------------------------------------------------------
*
- * TclUtfCount --
+ * UtfCount --
*
* Find the number of bytes in the Utf character "ch".
*
@@ -105,98 +107,44 @@ static int Invalid(const char *src);
*---------------------------------------------------------------------------
*/
-int
-TclUtfCount(
- int ch) /* The Unicode character whose size is returned. */
+INLINE static int
+UtfCount(
+ int ch) /* The Tcl_UniChar whose size is returned. */
{
- if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
+ if ((ch > 0) && (ch < UNICODE_SELF)) {
return 1;
}
if (ch <= 0x7FF) {
return 2;
}
- if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
+ if (ch <= 0xFFFF) {
+ return 3;
+ }
+#if TCL_UTF_MAX > 3
+ if (ch <= 0x1FFFFF) {
return 4;
}
+ if (ch <= 0x3FFFFFF) {
+ return 5;
+ }
+ if (ch <= 0x7FFFFFFF) {
+ return 6;
+ }
+#endif
return 3;
}
-
-/*
- *---------------------------------------------------------------------------
- *
- * Invalid --
- *
- * Given a pointer to a two-byte prefix of a well-formed UTF-8 byte
- * sequence (a lead byte followed by a trail byte) this routine
- * examines those two bytes to determine whether the sequence is
- * invalid in UTF-8. This might be because it is an overlong
- * encoding, or because it encodes something out of the proper range.
- *
- * Given a pointer to the bytes \xF8 or \xFC , this routine will
- * try to read beyond the end of the "bounds" table. Callers must
- * prevent this.
- *
- * Given a pointer to something else (an ASCII byte, a trail byte,
- * or another byte that can never begin a valid byte sequence such
- * as \xF5) this routine returns false. That makes the routine poorly
- * named, as it does not detect and report all invalid sequences.
- *
- * Callers have to take care that this routine does something useful
- * for their needs.
- *
- * Results:
- * A boolean.
- *---------------------------------------------------------------------------
- */
-static const unsigned char bounds[28] = {
- 0x80, 0x80, /* \xC0 accepts \x80 only */
- 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
- 0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */
- 0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */
- 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */
- 0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */
- 0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */
-};
-
-static int
-Invalid(
- const char *src) /* Points to lead byte of a UTF-8 byte sequence */
-{
- unsigned char byte = UCHAR(*src);
- int index;
-
- if ((byte & 0xC3) == 0xC0) {
- /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */
- index = (byte - 0xC0) >> 1;
- if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) {
- /* Out of bounds - report invalid. */
- return 1;
- }
- }
- return 0;
-}
-
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharToUtf --
*
- * Stores the given Tcl_UniChar as a sequence of UTF-8 bytes in the provided
- * buffer. Equivalent to Plan 9 runetochar().
- *
- * Surrogate pairs are handled as follows: When ch is a high surrogate,
- * the first byte of the 4-byte UTF-8 sequence is stored in the buffer and
- * the function returns 1. If the function is called again with a low
- * surrogate and the same buffer, the remaining 3 bytes of the 4-byte
- * UTF-8 sequence are produced.
- *
- * If no low surrogate follows the high surrogate (which is actually illegal),
- * calling Tcl_UniCharToUtf again with ch being -1 produces a 3-byte UTF-8
- * sequence representing the high surrogate.
+ * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
+ * provided buffer. Equivalent to Plan 9 runetochar().
*
* Results:
- * Returns the number of bytes stored into the buffer.
+ * The return value is the number of bytes in the buffer that were
+ * consumed.
*
* Side effects:
* None.
@@ -204,84 +152,63 @@ Invalid(
*---------------------------------------------------------------------------
*/
-Tcl_Size
+INLINE int
Tcl_UniCharToUtf(
- int ch, /* The Tcl_UniChar to be stored in the
- * buffer.
- */
- char *buf) /* Buffer in which the UTF-8 representation of
- * ch is stored. Must be large enough to hold the UTF-8
- * character (at most 4 bytes).
- */
+ int ch, /* The Tcl_UniChar to be stored in the
+ * buffer. */
+ char *buf) /* Buffer in which the UTF-8 representation of
+ * the Tcl_UniChar is stored. Buffer must be
+ * large enough to hold the UTF-8 character
+ * (at most TCL_UTF_MAX bytes). */
{
- if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
+ if ((ch > 0) && (ch < UNICODE_SELF)) {
buf[0] = (char) ch;
return 1;
}
if (ch >= 0) {
if (ch <= 0x7FF) {
- buf[1] = (char) (0x80 | (0x3F & ch));
- buf[0] = (char) (0xC0 | (ch >> 6));
+ buf[1] = (char) ((ch | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 6) | 0xC0);
return 2;
}
if (ch <= 0xFFFF) {
- if ((ch & 0xF800) == 0xD800) {
- if (ch & 0x0400) {
- /* Low surrogate */
- if ( (0x80 == (0xC0 & buf[0]))
- && (0 == (0xCF & buf[1]))) {
- /* Previous Tcl_UniChar was a high surrogate, so combine */
- buf[2] = (char) (0x80 | (0x3F & ch));
- buf[1] |= (char) (0x80 | (0x0F & (ch >> 6)));
- return 3;
- }
- /* Previous Tcl_UniChar was not a high surrogate, so just output */
- } else {
- /* High surrogate */
-
- /* Add 0x10000 to the raw number encoded in the surrogate
- * pair in order to get the code point.
- */
- ch += 0x40;
-
- /* Fill buffer with specific 3-byte (invalid) byte combination,
- so following low surrogate can recognize it and combine */
- buf[2] = (char) ((ch << 4) & 0x30);
- buf[1] = (char) (0x80 | (0x3F & (ch >> 2)));
- buf[0] = (char) (0xF0 | (0x07 & (ch >> 8)));
- return 1;
- }
- }
- goto three;
+ three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
}
- if (ch <= 0x10FFFF) {
- buf[3] = (char) (0x80 | (0x3F & ch));
- buf[2] = (char) (0x80 | (0x3F & (ch >> 6)));
- buf[1] = (char) (0x80 | (0x3F & (ch >> 12)));
- buf[0] = (char) (0xF0 | (ch >> 18));
+
+#if TCL_UTF_MAX > 3
+ if (ch <= 0x1FFFFF) {
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
- } else if (ch == -1) {
- if ( (0x80 == (0xC0 & buf[0]))
- && (0 == (0xCF & buf[1]))
- && (0xF0 == (0xF8 & buf[-1]))) {
- ch = 0xD7C0
- + ((0x07 & buf[-1]) << 8)
- + ((0x3F & buf[0]) << 2)
- + ((0x30 & buf[1]) >> 4);
- buf[1] = (char) (0x80 | (0x3F & ch));
- buf[0] = (char) (0x80 | (0x3F & (ch >> 6)));
- buf[-1] = (char) (0xE0 | (ch >> 12));
- return 2;
+ if (ch <= 0x3FFFFFF) {
+ buf[4] = (char) ((ch | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 24) | 0xF8);
+ return 5;
+ }
+ if (ch <= 0x7FFFFFFF) {
+ buf[5] = (char) ((ch | 0x80) & 0xBF);
+ buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 30) | 0xFC);
+ return 6;
}
+#endif
}
ch = 0xFFFD;
-three:
- buf[2] = (char) (0x80 | (0x3F & ch));
- buf[1] = (char) (0x80 | (0x3F & (ch >> 6)));
- buf[0] = (char) (0xE0 | (ch >> 12));
- return 3;
+ goto three;
}
/*
@@ -302,41 +229,29 @@ three:
*---------------------------------------------------------------------------
*/
-#undef Tcl_UniCharToUtfDString
char *
Tcl_UniCharToUtfDString(
- const int *uniStr, /* Unicode string to convert to UTF-8. */
- Tcl_Size uniLength, /* Length of Unicode string. Negative for nul
- * terminated string */
+ CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ int uniLength, /* Length of Unicode string in Tcl_UniChars
+ * (must be >= 0). */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- const int *w, *wEnd;
+ CONST Tcl_UniChar *w, *wEnd;
char *p, *string;
- Tcl_Size oldLength;
+ int oldLength;
/*
- * UTF-8 string length in bytes will be <= Unicode string length * 4.
+ * UTF-8 string length in bytes will be <= Unicode string length *
+ * TCL_UTF_MAX.
*/
- if (uniStr == NULL) {
- return NULL;
- }
- if (uniLength < 0) {
- uniLength = 0;
- w = uniStr;
- while (*w != '\0') {
- uniLength++;
- w++;
- }
- }
oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
+ Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
string = Tcl_DStringValue(dsPtr) + oldLength;
p = string;
wEnd = uniStr + uniLength;
-
for (w = uniStr; w < wEnd; ) {
p += Tcl_UniCharToUtf(*w, p);
w++;
@@ -346,61 +261,6 @@ Tcl_UniCharToUtfDString(
return string;
}
-char *
-Tcl_Char16ToUtfDString(
- const unsigned short *uniStr,/* Utf-16 string to convert to UTF-8. */
- Tcl_Size uniLength, /* Length of Utf-16 string. */
- Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
- * to this previously initialized DString. */
-{
- const unsigned short *w, *wEnd;
- char *p, *string;
- Tcl_Size oldLength;
- int len = 1;
-
- /*
- * UTF-8 string length in bytes will be <= Utf16 string length * 3.
- */
-
- if (uniStr == NULL) {
- return NULL;
- }
- if (uniLength < 0) {
-
- uniLength = 0;
- w = uniStr;
- while (*w != '\0') {
- uniLength++;
- w++;
- }
- }
- oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 3);
- string = Tcl_DStringValue(dsPtr) + oldLength;
-
- p = string;
- wEnd = uniStr + uniLength;
-
- for (w = uniStr; w < wEnd; ) {
- if (!len && ((*w & 0xFC00) != 0xDC00)) {
- /* Special case for handling high surrogates. */
- p += Tcl_UniCharToUtf(-1, p);
- }
- len = Tcl_UniCharToUtf(*w, p);
- p += len;
- if ((*w >= 0xD800) && (len < 3)) {
- len = 0; /* Indication that high surrogate was found */
- }
- w++;
- }
- if (!len) {
- /* Special case for handling high surrogates. */
- p += Tcl_UniCharToUtf(-1, p);
- }
- Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
-
- return string;
-}
/*
*---------------------------------------------------------------------------
*
@@ -417,15 +277,6 @@ Tcl_Char16ToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
- * Special handling of Surrogate pairs is done:
- * For any UTF-8 string containing a character outside of the BMP, the
- * first call to this function will fill *chPtr with the high surrogate
- * and generate a return value of 1. Calling Tcl_UtfToUniChar again
- * will produce the low surrogate and a return value of 3. Because *chPtr
- * is used to remember whether the high surrogate is already produced, it
- * is recommended to initialize the variable it points to as 0 before
- * the first call to Tcl_UtfToUniChar is done.
- *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -436,192 +287,89 @@ Tcl_Char16ToUtfDString(
*---------------------------------------------------------------------------
*/
-static const unsigned short cp1252[32] = {
- 0x20AC, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
- 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
- 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
- 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
-};
-
-Tcl_Size
+int
Tcl_UtfToUniChar(
- const char *src, /* The UTF-8 string. */
- int *chPtr)/* Filled with the Unicode character represented by
+ register CONST char *src, /* The UTF-8 string. */
+ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
- int byte;
+ register int byte;
/*
- * Unroll 1 to 4 byte UTF-8 sequences.
+ * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
*/
byte = *((unsigned char *) src);
if (byte < 0xC0) {
/*
* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
- * Treats naked trail bytes 0x80 to 0x9F as valid characters from
- * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
- * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
+ * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
* characters representing themselves.
*/
- if ((unsigned)(byte-0x80) < (unsigned)0x20) {
- *chPtr = cp1252[byte-0x80];
- } else {
- *chPtr = byte;
- }
+ *chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xE0) {
- if ((byte != 0xC1) && ((src[1] & 0xC0) == 0x80)) {
+ if ((src[1] & 0xC0) == 0x80) {
/*
* Two-byte-character lead-byte followed by a trail-byte.
*/
- *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
- return 2;
- }
+ *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
+ return 2;
}
/*
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
+
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
} else if (byte < 0xF0) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
* Three-byte-character lead byte followed by two trail bytes.
*/
- *chPtr = (((byte & 0x0F) << 12)
+ *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- if (*chPtr > 0x7FF) {
- return 3;
- }
+ return 3;
}
/*
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
- } else if (byte < 0xF5) {
- if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
- /*
- * Four-byte-character lead byte followed by three trail bytes.
- */
- *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
- | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
- if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
- return 4;
- }
- }
-
- /*
- * A four-byte-character lead-byte not followed by three trail-bytes
- * represents itself.
- */
- }
- *chPtr = byte;
- return 1;
-}
-
-Tcl_Size
-Tcl_UtfToChar16(
- const char *src, /* The UTF-8 string. */
- unsigned short *chPtr)/* Filled with the Tcl_UniChar represented by
- * the UTF-8 string. This could be a surrogate too. */
-{
- unsigned short byte;
-
- /*
- * Unroll 1 to 4 byte UTF-8 sequences.
- */
-
- byte = UCHAR(*src);
- if (byte < 0xC0) {
- /*
- * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
- * Treats naked trail bytes 0x80 to 0x9F as valid characters from
- * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
- * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
- * characters representing themselves.
- */
-
- /* If *chPtr contains a high surrogate (produced by a previous
- * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
- * bytes, then we must produce a follow-up low surrogate. We only
- * do that if the high surrogate matches the bits we encounter.
- */
- if (((byte & 0xC0) == 0x80)
- && ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)
- && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
- && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) {
- *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
- return 3;
- }
- if ((unsigned)(byte-0x80) < (unsigned)0x20) {
- *chPtr = cp1252[byte-0x80];
- } else {
- *chPtr = byte;
- }
+ *chPtr = (Tcl_UniChar) byte;
return 1;
- } else if (byte < 0xE0) {
- if ((byte != 0xC1) && ((src[1] & 0xC0) == 0x80)) {
- /*
- * Two-byte-character lead-byte followed by a trail-byte.
- */
-
- *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
- return 2;
- }
- }
-
- /*
- * A two-byte-character lead-byte not followed by trail-byte
- * represents itself.
- */
- } else if (byte < 0xF0) {
- if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
- /*
- * Three-byte-character lead byte followed by two trail bytes.
- */
-
- *chPtr = (((byte & 0x0F) << 12)
- | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- if (*chPtr > 0x7FF) {
- return 3;
- }
- }
+ }
+#if TCL_UTF_MAX > 3
+ {
+ int ch, total, trail;
- /*
- * A three-byte-character lead-byte not followed by two trail-bytes
- * represents itself.
- */
- } else if (byte < 0xF5) {
- if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
- /*
- * Four-byte-character lead byte followed by at least two trail bytes.
- * We don't test the validity of 3rd trail byte, see [ed29806ba]
- */
- Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
- | ((src[2] & 0x3F) >> 4)) - 0x40;
- if (high < 0x400) {
- /* produce high surrogate, advance source pointer */
- *chPtr = 0xD800 + high;
- return 1;
- }
- /* out of range, < 0x10000 or > 0x10FFFF */
+ total = totalBytes[byte];
+ trail = total - 1;
+ if (trail > 0) {
+ ch = byte & (0x3F >> trail);
+ do {
+ src++;
+ if ((*src & 0xC0) != 0x80) {
+ *chPtr = byte;
+ return 1;
+ }
+ ch <<= 6;
+ ch |= (*src & 0x3F);
+ trail--;
+ } while (trail > 0);
+ *chPtr = ch;
+ return total;
}
-
- /*
- * A four-byte-character lead-byte not followed by three trail-bytes
- * represents itself.
- */
}
+#endif
- *chPtr = byte;
+ *chPtr = (Tcl_UniChar) byte;
return 1;
}
@@ -643,118 +391,42 @@ Tcl_UtfToChar16(
*---------------------------------------------------------------------------
*/
-#undef Tcl_UtfToUniCharDString
-int *
+Tcl_UniChar *
Tcl_UtfToUniCharDString(
- const char *src, /* UTF-8 string to convert to Unicode. */
- Tcl_Size length, /* Length of UTF-8 string in bytes, or -1 for
+ CONST char *src, /* UTF-8 string to convert to Unicode. */
+ int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
Tcl_DString *dsPtr) /* Unicode representation of string is
* appended to this previously initialized
* DString. */
{
- int ch = 0, *w, *wString;
- const char *p;
- Tcl_Size oldLength;
- /* Pointer to the end of string. Never read endPtr[0] */
- const char *endPtr = src + length;
- /* Pointer to last byte where optimization still can be used */
- const char *optPtr = endPtr - TCL_UTF_MAX;
-
- if (src == NULL) {
- return NULL;
- }
- if (length < 0) {
- length = strlen(src);
- }
-
- /*
- * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
- * bytes.
- */
-
- oldLength = Tcl_DStringLength(dsPtr);
-
- Tcl_DStringSetLength(dsPtr,
- oldLength + ((length + 1) * sizeof(int)));
- wString = (int *) (Tcl_DStringValue(dsPtr) + oldLength);
-
- w = wString;
- p = src;
- endPtr = src + length;
- optPtr = endPtr - 4;
- while (p <= optPtr) {
- p += TclUtfToUniChar(p, &ch);
- *w++ = ch;
- }
- while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
- p += TclUtfToUniChar(p, &ch);
- *w++ = ch;
- }
- while (p < endPtr) {
- *w++ = UCHAR(*p++);
- }
- *w = '\0';
- Tcl_DStringSetLength(dsPtr,
- oldLength + ((char *) w - (char *) wString));
-
- return wString;
-}
+ Tcl_UniChar *w, *wString;
+ CONST char *p, *end;
+ int oldLength;
-unsigned short *
-Tcl_UtfToChar16DString(
- const char *src, /* UTF-8 string to convert to Unicode. */
- Tcl_Size length, /* Length of UTF-8 string in bytes, or -1 for
- * strlen(). */
- Tcl_DString *dsPtr) /* Unicode representation of string is
- * appended to this previously initialized
- * DString. */
-{
- unsigned short ch = 0, *w, *wString;
- const char *p;
- Tcl_Size oldLength;
- /* Pointer to the end of string. Never read endPtr[0] */
- const char *endPtr = src + length;
- /* Pointer to last byte where optimization still can be used */
- const char *optPtr = endPtr - TCL_UTF_MAX;
-
- if (src == NULL) {
- return NULL;
- }
if (length < 0) {
length = strlen(src);
}
/*
- * Unicode string length in WCHARs will be <= UTF-8 string length in
+ * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
* bytes.
*/
oldLength = Tcl_DStringLength(dsPtr);
-
Tcl_DStringSetLength(dsPtr,
- oldLength + ((length + 1) * sizeof(unsigned short)));
- wString = (unsigned short *) (Tcl_DStringValue(dsPtr) + oldLength);
+ (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
+ wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
- p = src;
- endPtr = src + length;
- optPtr = endPtr - 3;
- while (p <= optPtr) {
- p += Tcl_UtfToChar16(p, &ch);
- *w++ = ch;
- }
- while (p < endPtr) {
- if (Tcl_UtfCharComplete(p, endPtr-p)) {
- p += Tcl_UtfToChar16(p, &ch);
- *w++ = ch;
- } else {
- *w++ = UCHAR(*p++);
- }
+ end = src + length;
+ for (p = src; p < end; ) {
+ p += TclUtfToUniChar(p, w);
+ w++;
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
- oldLength + ((char *) w - (char *) wString));
+ (oldLength + ((char *) w - (char *) wString)));
return wString;
}
@@ -780,11 +452,14 @@ Tcl_UtfToChar16DString(
int
Tcl_UtfCharComplete(
- const char *src, /* String to check if first few bytes contain
+ CONST char *src, /* String to check if first few bytes contain
* a complete UTF-8 character. */
- Tcl_Size length) /* Length of above string in bytes. */
+ int length) /* Length of above string in bytes. */
{
- return length >= complete[UCHAR(*src)];
+ int ch;
+
+ ch = *((unsigned char *) src);
+ return length >= totalBytes[ch];
}
/*
@@ -805,124 +480,59 @@ Tcl_UtfCharComplete(
*---------------------------------------------------------------------------
*/
-Tcl_Size
-TclNumUtfChars(
- const char *src, /* The UTF-8 string to measure. */
- Tcl_Size length) /* The length of the string in bytes, or
- * negative value for strlen(src). */
-{
- Tcl_UniChar ch = 0;
- Tcl_Size i = 0;
-
- if (length < 0) {
- /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
- while ((*src != '\0') && (i < INT_MAX)) {
- src += TclUtfToUniChar(src, &ch);
- i++;
- }
- } else {
- /* Will return value between 0 and length. No overflow checks. */
-
- /* Pointer to the end of string. Never read endPtr[0] */
- const char *endPtr = src + length;
- /* Pointer to last byte where optimization still can be used */
- const char *optPtr = endPtr - 4;
-
- /*
- * Optimize away the call in this loop. Justified because...
- * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
- * By initialization above (endPtr - optPtr) = TCL_UTF_MAX
- * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
- * Tcl_UtfCharComplete we know will cause return of 1.
- */
- while (src <= optPtr
- /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
- src += TclUtfToUniChar(src, &ch);
- i++;
- }
- /* Loop over the remaining string where call must happen */
- while (src < endPtr) {
- if (Tcl_UtfCharComplete(src, endPtr - src)) {
- src += TclUtfToUniChar(src, &ch);
- } else {
- /*
- * src points to incomplete UTF-8 sequence
- * Treat first byte as character and count it
- */
- src++;
- }
- i++;
- }
- }
- return i;
-}
-
-#if !defined(TCL_NO_DEPRECATED)
-Tcl_Size
+int
Tcl_NumUtfChars(
- const char *src, /* The UTF-8 string to measure. */
- Tcl_Size length) /* The length of the string in bytes, or
- * negative for strlen(src). */
+ register CONST char *src, /* The UTF-8 string to measure. */
+ int length) /* The length of the string in bytes, or -1
+ * for strlen(src). */
{
- unsigned short ch = 0;
- Tcl_Size i = 0;
+ Tcl_UniChar ch;
+ register Tcl_UniChar *chPtr = &ch;
+ register int i;
+
+ /*
+ * The separate implementations are faster.
+ *
+ * Since this is a time-sensitive function, we also do the check for the
+ * single-byte char case specially.
+ */
+ i = 0;
if (length < 0) {
- /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
- while ((*src != '\0') && (i < INT_MAX)) {
- src += Tcl_UtfToChar16(src, &ch);
+ while (*src != '\0') {
+ src += TclUtfToUniChar(src, chPtr);
i++;
}
} else {
- /* Will return value between 0 and length. No overflow checks. */
+ register int n;
- /* Pointer to the end of string. Never read endPtr[0] */
- const char *endPtr = src + length;
- /* Pointer to last byte where optimization still can be used */
- const char *optPtr = endPtr - 4;
-
- /*
- * Optimize away the call in this loop. Justified because...
- * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
- * By initialization above (endPtr - optPtr) = TCL_UTF_MAX
- * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
- * Tcl_UtfCharComplete we know will cause return of 1.
- */
- while (src <= optPtr
- /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
- src += Tcl_UtfToChar16(src, &ch);
- i++;
- }
- /* Loop over the remaining string where call must happen */
- while (src < endPtr) {
- if (Tcl_UtfCharComplete(src, endPtr - src)) {
- src += Tcl_UtfToChar16(src, &ch);
- } else {
- /*
- * src points to incomplete UTF-8 sequence
- * Treat first byte as character and count it
- */
+ while (length > 0) {
+ if (UCHAR(*src) < 0xC0) {
+ length--;
src++;
+ } else {
+ n = Tcl_UtfToUniChar(src, chPtr);
+ length -= n;
+ src += n;
}
i++;
}
}
return i;
}
-#endif
-
+
/*
*---------------------------------------------------------------------------
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurrence of the given Unicode character
- * in the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the first occurrence of the given Tcl_UniChar in
+ * the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
* Results:
- * As above. If the Unicode character does not exist in the given string,
- * the return value is NULL.
+ * As above. If the Tcl_UniChar does not exist in the given string, the
+ * return value is NULL.
*
* Side effects:
* None.
@@ -930,14 +540,16 @@ Tcl_NumUtfChars(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfFindFirst(
- const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Unicode character to search for. */
+ CONST char *src, /* The UTF-8 string to be searched. */
+ int ch) /* The Tcl_UniChar to search for. */
{
- while (1) {
- int find, len = TclUtfToUniChar(src, &find);
+ int len;
+ Tcl_UniChar find;
+ while (1) {
+ len = TclUtfToUniChar(src, &find);
if (find == ch) {
return src;
}
@@ -953,12 +565,12 @@ Tcl_UtfFindFirst(
*
* Tcl_UtfFindLast --
*
- * Returns a pointer to the last occurrence of the given Unicode character
- * in the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the last occurrence of the given Tcl_UniChar in
+ * the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
*
* Results:
- * As above. If the Unicode character does not exist in the given string, the
+ * As above. If the Tcl_UniChar does not exist in the given string, the
* return value is NULL.
*
* Side effects:
@@ -967,16 +579,18 @@ Tcl_UtfFindFirst(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfFindLast(
- const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Unicode character to search for. */
+ CONST char *src, /* The UTF-8 string to be searched. */
+ int ch) /* The Tcl_UniChar to search for. */
{
- const char *last = NULL;
+ int len;
+ Tcl_UniChar find;
+ CONST char *last;
+ last = NULL;
while (1) {
- int find, len = TclUtfToUniChar(src, &find);
-
+ len = TclUtfToUniChar(src, &find);
if (find == ch) {
last = src;
}
@@ -993,11 +607,9 @@ Tcl_UtfFindLast(
*
* Tcl_UtfNext --
*
- * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
- * returns a pointer to the next UTF-8 character in the string.
- * The caller must not ask for the next character after the last
- * character in the string if the string is not terminated by a null
- * character.
+ * Given a pointer to some current location in a UTF-8 string, move
+ * forward one character. The caller must ensure that they are not asking
+ * for the next character after the last character in the string.
*
* Results:
* The return value is the pointer to the next character in the UTF-8
@@ -1009,48 +621,13 @@ Tcl_UtfFindLast(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfNext(
- const char *src) /* The current location in the string. */
+ CONST char *src) /* The current location in the string. */
{
- int left;
- const char *next;
-
- if (((*src) & 0xC0) == 0x80) {
- /* Continuation byte, so we start 'inside' a (possible valid) UTF-8
- * sequence. Since we are not allowed to access src[-1], we cannot
- * check if the sequence is actually valid, the best we can do is
- * just assume it is valid and locate the end. */
- if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
- ++src;
- }
- return src;
- }
+ Tcl_UniChar ch;
- left = totalBytes[UCHAR(*src)];
- next = src + 1;
- while (--left) {
- if ((*next & 0xC0) != 0x80) {
- /*
- * src points to non-trail byte; We ran out of trail bytes
- * before the needs of the lead byte were satisfied.
- * Let the (malformed) lead byte alone be a character
- */
- return src + 1;
- }
- next++;
- }
- /*
- * Call Invalid() here only if required conditions are met:
- * src[0] is known a lead byte.
- * src[1] is known a trail byte.
- * Especially important to prevent calls when src[0] == '\xF8' or '\xFC'
- * See tests utf-6.37 through utf-6.43 through valgrind or similar tool.
- */
- if ((next == src + 1) || Invalid(src)) {
- return src + 1;
- }
- return next;
+ return src + TclUtfToUniChar(src, &ch);
}
/*
@@ -1074,96 +651,34 @@ Tcl_UtfNext(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfPrev(
- const char *src, /* A location in a UTF-8 string. */
- const char *start) /* Pointer to the beginning of the string */
+ CONST char *src, /* The current location in the string. */
+ CONST char *start) /* Pointer to the beginning of the string, to
+ * avoid going backwards too far. */
{
- int trailBytesSeen = 0; /* How many trail bytes have been verified? */
- const char *fallback = src - 1;
- /* If we cannot find a lead byte that might
- * start a prefix of a valid UTF byte sequence,
- * we will fallback to a one-byte back step */
- const char *look = fallback;
- /* Start search at the fallback position */
-
- /* Quick boundary case exit. */
- if (fallback <= start) {
- return start;
- }
-
- do {
- unsigned char byte = UCHAR(look[0]);
-
+ CONST char *look;
+ int i, byte;
+
+ src--;
+ look = src;
+ for (i = 0; i < TCL_UTF_MAX; i++) {
+ if (look < start) {
+ if (src < start) {
+ src = start;
+ }
+ break;
+ }
+ byte = *((unsigned char *) look);
if (byte < 0x80) {
- /*
- * Single byte character. Either this is a correct previous
- * character, or it is followed by at least one trail byte
- * which indicates a malformed sequence. In either case the
- * correct result is to return the fallback.
- */
- return fallback;
+ break;
}
if (byte >= 0xC0) {
- /* Non-trail byte; May be multibyte lead. */
-
- if ((trailBytesSeen == 0)
- /*
- * We've seen no trailing context to use to check
- * anything. From what we know, this non-trail byte
- * is a prefix of a previous character, and accepting
- * it (the fallback) is correct.
- */
-
- || (trailBytesSeen >= totalBytes[byte])) {
- /*
- * That is, (1 + trailBytesSeen > needed).
- * We've examined more bytes than needed to complete
- * this lead byte. No matter about well-formedness or
- * validity, the sequence starting with this lead byte
- * will never include the fallback location, so we must
- * return the fallback location. See test utf-7.17
- */
- return fallback;
- }
-
- /*
- * trailBytesSeen > 0, so we can examine look[1] safely.
- * Use that capability to screen out invalid sequences.
- */
-
- if (Invalid(look)) {
- /* Reject */
- return fallback;
- }
- return (const char *)look;
+ return look;
}
-
- /* We saw a trail byte. */
- trailBytesSeen++;
-
- if ((const char *)look == start) {
- /*
- * Do not read before the start of the string
- *
- * If we get here, we've examined bytes at every location
- * >= start and < src and all of them are trail bytes,
- * including (*start). We need to return our fallback
- * and exit this loop before we run past the start of the string.
- */
- return fallback;
- }
-
- /* Continue the search backwards... */
look--;
- } while (trailBytesSeen < 4);
-
- /*
- * We've seen 4 trail bytes, so we know there will not be a
- * properly formed byte sequence to find, and we can stop looking,
- * accepting the fallback.
- */
- return fallback;
+ }
+ return src;
}
/*
@@ -1183,27 +698,18 @@ Tcl_UtfPrev(
*---------------------------------------------------------------------------
*/
-int
+Tcl_UniChar
Tcl_UniCharAtIndex(
- const char *src, /* The UTF-8 string to dereference. */
- Tcl_Size index) /* The position of the desired character. */
+ register CONST char *src, /* The UTF-8 string to dereference. */
+ register int index) /* The position of the desired character. */
{
- unsigned short ch = 0;
- int i = 0;
+ Tcl_UniChar ch;
- if (index < 0) {
- return -1;
- }
- while (index-- > 0) {
- i = Tcl_UtfToChar16(src, &ch);
- src += i;
- }
- if ((ch >= 0xD800) && (i < 3)) {
- /* Index points at character following high Surrogate */
- return -1;
+ while (index >= 0) {
+ index--;
+ src += TclUtfToUniChar(src, &ch);
}
- TclUtfToUniChar(src, &i);
- return i;
+ return ch;
}
/*
@@ -1223,41 +729,20 @@ Tcl_UniCharAtIndex(
*---------------------------------------------------------------------------
*/
-const char *
-TclUtfAtIndex(
- const char *src, /* The UTF-8 string. */
- Tcl_Size index) /* The position of the desired character. */
+CONST char *
+Tcl_UtfAtIndex(
+ register CONST char *src, /* The UTF-8 string. */
+ register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch = 0;
+ Tcl_UniChar ch;
- while (index-- > 0) {
+ while (index > 0) {
+ index--;
src += TclUtfToUniChar(src, &ch);
}
return src;
}
-#if !defined(TCL_NO_DEPRECATED)
-const char *
-Tcl_UtfAtIndex(
- const char *src, /* The UTF-8 string. */
- Tcl_Size index) /* The position of the desired character. */
-{
- unsigned short ch = 0;
- Tcl_Size len = 0;
-
- if (index > 0) {
- while (index--) {
- src += (len = Tcl_UtfToChar16(src, &ch));
- }
- if ((ch >= 0xD800) && (len < 3)) {
- /* Index points at character following high Surrogate */
- src += Tcl_UtfToChar16(src, &ch);
- }
- }
- return src;
-}
-#endif
-
/*
*---------------------------------------------------------------------------
*
@@ -1267,7 +752,7 @@ Tcl_UtfAtIndex(
*
* Results:
* Stores the bytes represented by the backslash sequence in dst and
- * returns the number of bytes written to dst. At most 4 bytes
+ * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
* are written to dst; dst must have been large enough to accept those
* bytes. If readPtr isn't NULL then it is filled in with a count of the
* number of bytes in the backslash sequence.
@@ -1284,9 +769,9 @@ Tcl_UtfAtIndex(
*---------------------------------------------------------------------------
*/
-Tcl_Size
+int
Tcl_UtfBackslash(
- const char *src, /* Points to the backslash character of a
+ CONST char *src, /* Points to the backslash character of a
* backslash sequence. */
int *readPtr, /* Fill in with number of characters read from
* src, unless NULL. */
@@ -1294,7 +779,7 @@ Tcl_UtfBackslash(
* backslash sequence. */
{
#define LINE_LENGTH 128
- Tcl_Size numRead;
+ int numRead;
int result;
result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
@@ -1303,7 +788,7 @@ Tcl_UtfBackslash(
* We ate a whole line. Pay the price of a strlen()
*/
- result = TclParseBackslash(src, strlen(src), &numRead, dst);
+ result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
}
if (readPtr != NULL) {
*readPtr = numRead;
@@ -1329,13 +814,13 @@ Tcl_UtfBackslash(
*----------------------------------------------------------------------
*/
-Tcl_Size
+int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- int ch, upChar;
+ Tcl_UniChar ch, upChar;
char *src, *dst;
- Tcl_Size len;
+ int bytes;
/*
* Iterate over the string until we hit the terminating null.
@@ -1343,7 +828,7 @@ Tcl_UtfToUpper(
src = dst = str;
while (*src) {
- len = TclUtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
upChar = Tcl_UniCharToUpper(ch);
/*
@@ -1352,13 +837,13 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) {
- memmove(dst, src, len);
- dst += len;
+ if (bytes < UtfCount(upChar)) {
+ memcpy(dst, src, (size_t) bytes);
+ dst += bytes;
} else {
dst += Tcl_UniCharToUtf(upChar, dst);
}
- src += len;
+ src += bytes;
}
*dst = '\0';
return (dst - str);
@@ -1382,13 +867,13 @@ Tcl_UtfToUpper(
*----------------------------------------------------------------------
*/
-Tcl_Size
+int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- int ch, lowChar;
+ Tcl_UniChar ch, lowChar;
char *src, *dst;
- Tcl_Size len;
+ int bytes;
/*
* Iterate over the string until we hit the terminating null.
@@ -1396,7 +881,7 @@ Tcl_UtfToLower(
src = dst = str;
while (*src) {
- len = TclUtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
lowChar = Tcl_UniCharToLower(ch);
/*
@@ -1405,13 +890,13 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
- memmove(dst, src, len);
- dst += len;
+ if (bytes < UtfCount(lowChar)) {
+ memcpy(dst, src, (size_t) bytes);
+ dst += bytes;
} else {
dst += Tcl_UniCharToUtf(lowChar, dst);
}
- src += len;
+ src += bytes;
}
*dst = '\0';
return (dst - str);
@@ -1436,13 +921,13 @@ Tcl_UtfToLower(
*----------------------------------------------------------------------
*/
-Tcl_Size
+int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- int ch, titleChar, lowChar;
+ Tcl_UniChar ch, titleChar, lowChar;
char *src, *dst;
- Tcl_Size len;
+ int bytes;
/*
* Capitalize the first character and then lowercase the rest of the
@@ -1452,32 +937,28 @@ Tcl_UtfToTitle(
src = dst = str;
if (*src) {
- len = TclUtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
titleChar = Tcl_UniCharToTitle(ch);
- if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) {
- memmove(dst, src, len);
- dst += len;
+ if (bytes < UtfCount(titleChar)) {
+ memcpy(dst, src, (size_t) bytes);
+ dst += bytes;
} else {
dst += Tcl_UniCharToUtf(titleChar, dst);
}
- src += len;
+ src += bytes;
}
while (*src) {
- len = TclUtfToUniChar(src, &ch);
- lowChar = ch;
- /* Special exception for Georgian Asomtavruli chars, no titlecase. */
- if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
- lowChar = Tcl_UniCharToLower(lowChar);
- }
+ bytes = TclUtfToUniChar(src, &ch);
+ lowChar = Tcl_UniCharToLower(ch);
- if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
- memmove(dst, src, len);
- dst += len;
+ if (bytes < UtfCount(lowChar)) {
+ memcpy(dst, src, (size_t) bytes);
+ dst += bytes;
} else {
dst += Tcl_UniCharToUtf(lowChar, dst);
}
- src += len;
+ src += bytes;
}
*dst = '\0';
return (dst - str);
@@ -1502,8 +983,8 @@ Tcl_UtfToTitle(
int
TclpUtfNcmp2(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct, /* UTF string cs is compared to. */
unsigned long numBytes) /* Number of *bytes* to compare. */
{
/*
@@ -1512,7 +993,7 @@ TclpUtfNcmp2(
* fine in the strcmp manner.
*/
- int result = 0;
+ register int result = 0;
for ( ; numBytes != 0; numBytes--, cs++, ct++) {
if (*cs != *ct) {
@@ -1535,8 +1016,8 @@ TclpUtfNcmp2(
*
* Tcl_UtfNcmp --
*
- * Compare at most numChars UTF-16 chars of string cs to string ct. Both cs
- * and ct are assumed to be at least numChars UTF-16 chars long.
+ * Compare at most numChars UTF chars of string cs to string ct. Both cs
+ * and ct are assumed to be at least numChars UTF chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -1547,89 +1028,17 @@ TclpUtfNcmp2(
*----------------------------------------------------------------------
*/
-#if !defined(TCL_NO_DEPRECATED)
int
Tcl_UtfNcmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
- unsigned long numChars) /* Number of UTF-16 chars to compare. */
-{
- unsigned short ch1 = 0, ch2 = 0;
-
- /*
- * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
- * (the byte 0x01.)
- */
-
- while (numChars-- > 0) {
- /*
- * n must be interpreted as UTF-16 chars, not bytes. This should be called
- * only when both strings are of at least n UTF-16 chars long (no need for \0
- * check)
- */
-
- cs += Tcl_UtfToChar16(cs, &ch1);
- ct += Tcl_UtfToChar16(ct, &ch2);
- if (ch1 != ch2) {
- /* Surrogates always report higher than non-surrogates */
- if (((ch1 & 0xFC00) == 0xD800)) {
- if ((ch2 & 0xFC00) != 0xD800) {
- return ch1;
- }
- } else if ((ch2 & 0xFC00) == 0xD800) {
- return -ch2;
- }
- return (ch1 - ch2);
- }
- }
- return 0;
-}
-#endif /* TCL_NO_DEPRECATED */
-
-int
-TclUtfNcmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
- size_t numChars) /* Number of chars to compare. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct, /* UTF string cs is compared to. */
+ unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1 = 0, ch2 = 0;
+ Tcl_UniChar ch1, ch2;
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
- * (the byte 0x01.)
- */
-
- while (numChars-- > 0) {
- /*
- * n must be interpreted as chars, not bytes. This should be called
- * only when both strings are of at least n chars long (no need for \0
- * check)
- */
-
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- return (ch1 - ch2);
- }
- }
- return 0;
-}
-
-int
-TclUtfNmemcmp(
- const void *csPtr, /* UTF string to compare to ct. */
- const void *ctPtr, /* UTF string cs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
- Tcl_UniChar ch1 = 0, ch2 = 0;
- const char *cs = (const char *)csPtr;
- const char *ct = (const char *)ctPtr;
-
- /*
- * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
+ * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
* (the byte 0x01.)
*/
@@ -1654,8 +1063,8 @@ TclUtfNmemcmp(
*
* Tcl_UtfNcasecmp --
*
- * Compare at most numChars UTF-16 chars of string cs to string ct case
- * insensitive. Both cs and ct are assumed to be at least numChars UTF-16
+ * Compare at most numChars UTF chars of string cs to string ct case
+ * insensitive. Both cs and ct are assumed to be at least numChars UTF
* chars long.
*
* Results:
@@ -1667,81 +1076,13 @@ TclUtfNmemcmp(
*----------------------------------------------------------------------
*/
-#if !defined(TCL_NO_DEPRECATED)
int
Tcl_UtfNcasecmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
- unsigned long numChars) /* Number of UTF-16 chars to compare. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct, /* UTF string cs is compared to. */
+ unsigned long numChars) /* Number of UTF chars to compare. */
{
- unsigned short ch1 = 0, ch2 = 0;
-
- while (numChars-- > 0) {
- /*
- * n must be interpreted as UTF-16 chars, not bytes.
- * This should be called only when both strings are of
- * at least n UTF-16 chars long (no need for \0 check)
- */
- cs += Tcl_UtfToChar16(cs, &ch1);
- ct += Tcl_UtfToChar16(ct, &ch2);
- if (ch1 != ch2) {
- /* Surrogates always report higher than non-surrogates */
- if (((ch1 & 0xFC00) == 0xD800)) {
- if ((ch2 & 0xFC00) != 0xD800) {
- return ch1;
- }
- } else if ((ch2 & 0xFC00) == 0xD800) {
- return -ch2;
- }
- ch1 = Tcl_UniCharToLower(ch1);
- ch2 = Tcl_UniCharToLower(ch2);
- if (ch1 != ch2) {
- return (ch1 - ch2);
- }
- }
- }
- return 0;
-}
-#endif /* TCL_NO_DEPRECATED */
-
-
-int
-TclUtfNcasecmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
- Tcl_UniChar ch1 = 0, ch2 = 0;
-
- while (numChars-- > 0) {
- /*
- * n must be interpreted as chars, not bytes.
- * This should be called only when both strings are of
- * at least n chars long (no need for \0 check)
- */
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- ch1 = Tcl_UniCharToLower(ch1);
- ch2 = Tcl_UniCharToLower(ch2);
- if (ch1 != ch2) {
- return (ch1 - ch2);
- }
- }
- }
- return 0;
-}
-
-int
-TclUtfNcasememcmp(
- const void *csPtr, /* UTF string to compare to ct. */
- const void *ctPtr, /* UTF string cs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
- const char *cs = (const char *)csPtr;
- const char *ct = (const char *)ctPtr;
- Tcl_UniChar ch1 = 0, ch2 = 0;
-
+ Tcl_UniChar ch1, ch2;
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1760,47 +1101,11 @@ TclUtfNcasememcmp(
}
return 0;
}
-
-/*
- *----------------------------------------------------------------------
- *
- * Tcl_UtfCmp --
- *
- * Compare UTF chars of string cs to string ct case sensitively.
- * Replacement for strcmp in Tcl core, in places where UTF-8 should
- * be handled.
- *
- * Results:
- * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
-int
-TclUtfCmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct) /* UTF string cs is compared to. */
-{
- Tcl_UniChar ch1 = 0, ch2 = 0;
-
- while (*cs && *ct) {
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- return ch1 - ch2;
- }
- }
- return UCHAR(*cs) - UCHAR(*ct);
-}
-
/*
*----------------------------------------------------------------------
*
- * TclUtfCasecmp --
+ * TclUtfCasecmp --
*
* Compare UTF chars of string cs to string ct case insensitively.
* Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1817,12 +1122,12 @@ TclUtfCmp(
int
TclUtfCasecmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct) /* UTF string cs is compared to. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct) /* UTF string cs is compared to. */
{
- Tcl_UniChar ch1 = 0, ch2 = 0;
-
while (*cs && *ct) {
+ Tcl_UniChar ch1, ch2;
+
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
@@ -1853,19 +1158,16 @@ TclUtfCasecmp(
*----------------------------------------------------------------------
*/
-int
+Tcl_UniChar
Tcl_UniCharToUpper(
int ch) /* Unicode character to convert. */
{
- if (!UNICODE_OUT_OF_RANGE(ch)) {
- int info = GetUniCharInfo(ch);
+ int info = GetUniCharInfo(ch);
- if (GetCaseType(info) & 0x04) {
- ch -= GetDelta(info);
- }
+ if (GetCaseType(info) & 0x04) {
+ ch -= GetDelta(info);
}
- /* Clear away extension bits, if any */
- return ch & 0x1FFFFF;
+ return (Tcl_UniChar) ch;
}
/*
@@ -1884,22 +1186,18 @@ Tcl_UniCharToUpper(
*----------------------------------------------------------------------
*/
-int
+Tcl_UniChar
Tcl_UniCharToLower(
int ch) /* Unicode character to convert. */
{
- if (!UNICODE_OUT_OF_RANGE(ch)) {
- int info = GetUniCharInfo(ch);
- int mode = GetCaseType(info);
+ int info = GetUniCharInfo(ch);
- if ((mode & 0x02) && (mode != 0x7)) {
- ch += GetDelta(info);
- }
+ if (GetCaseType(info) & 0x02) {
+ ch += GetDelta(info);
}
- /* Clear away extension bits, if any */
- return ch & 0x1FFFFF;
+ return (Tcl_UniChar) ch;
}
-
+
/*
*----------------------------------------------------------------------
*
@@ -1916,63 +1214,28 @@ Tcl_UniCharToLower(
*----------------------------------------------------------------------
*/
-int
+Tcl_UniChar
Tcl_UniCharToTitle(
int ch) /* Unicode character to convert. */
{
- if (!UNICODE_OUT_OF_RANGE(ch)) {
- int info = GetUniCharInfo(ch);
- int mode = GetCaseType(info);
+ int info = GetUniCharInfo(ch);
+ int mode = GetCaseType(info);
- if (mode & 0x1) {
- /*
- * Subtract or add one depending on the original case.
- */
+ if (mode & 0x1) {
+ /*
+ * Subtract or add one depending on the original case.
+ */
- if (mode != 0x7) {
- ch += ((mode & 0x4) ? -1 : 1);
- }
- } else if (mode == 0x4) {
- ch -= GetDelta(info);
- }
+ ch += ((mode & 0x4) ? -1 : 1);
+ } else if (mode == 0x4) {
+ ch -= GetDelta(info);
}
- /* Clear away extension bits, if any */
- return ch & 0x1FFFFF;
+ return (Tcl_UniChar) ch;
}
/*
*----------------------------------------------------------------------
*
- * Tcl_Char16Len --
- *
- * Find the length of a UniChar string. The str input must be null
- * terminated.
- *
- * Results:
- * Returns the length of str in UniChars (not bytes).
- *
- * Side effects:
- * None.
- *
- *----------------------------------------------------------------------
- */
-
-Tcl_Size
-Tcl_Char16Len(
- const unsigned short *uniStr) /* Unicode string to find length of. */
-{
- Tcl_Size len = 0;
-
- while (*uniStr != '\0') {
- len++;
- uniStr++;
- }
- return len;
-}
-
-/*
- *----------------------------------------------------------------------
- *
* Tcl_UniCharLen --
*
* Find the length of a UniChar string. The str input must be null
@@ -1987,12 +1250,11 @@ Tcl_Char16Len(
*----------------------------------------------------------------------
*/
-#undef Tcl_UniCharLen
-Tcl_Size
+int
Tcl_UniCharLen(
- const int *uniStr) /* Unicode string to find length of. */
+ CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
- Tcl_Size len = 0;
+ int len = 0;
while (*uniStr != '\0') {
len++;
@@ -2006,8 +1268,8 @@ Tcl_UniCharLen(
*
* Tcl_UniCharNcmp --
*
- * Compare at most numChars chars (not bytes) of string ucs to string uct.
- * Both ucs and uct are assumed to be at least numChars chars long.
+ * Compare at most numChars unichars of string ucs to string uct.
+ * Both ucs and uct are assumed to be at least numChars unichars long.
*
* Results:
* Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
@@ -2019,69 +1281,12 @@ Tcl_UniCharLen(
*/
int
-TclUniCharNcmp(
- const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
-#if defined(WORDS_BIGENDIAN)
- /*
- * We are definitely on a big-endian machine; memcmp() is safe
- */
-
- return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
-
-#else /* !WORDS_BIGENDIAN */
- /*
- * We can't simply call memcmp() because that is not lexically correct.
- */
-
- for ( ; numChars != 0; ucs++, uct++, numChars--) {
- if (*ucs != *uct) {
- return (*ucs - *uct);
- }
- }
- return 0;
-#endif /* WORDS_BIGENDIAN */
-}
-
-int
-TclUniCharNmemcmp(
- const void *ucsPtr, /* Unicode string to compare to uct. */
- const void *uctPtr, /* Unicode string ucs is compared to. */
- size_t numChars) /* Number of chars (not bytes) to compare. */
-{
- const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr;
- const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr;
-#if defined(WORDS_BIGENDIAN)
- /*
- * We are definitely on a big-endian machine; memcmp() is safe
- */
-
- return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
-
-#else /* !WORDS_BIGENDIAN */
- /*
- * We can't simply call memcmp() because that is not lexically correct.
- */
-
- for ( ; numChars != 0; ucs++, uct++, numChars--) {
- if (*ucs != *uct) {
- return (*ucs - *uct);
- }
- }
- return 0;
-#endif /* WORDS_BIGENDIAN */
-}
-
-#if !defined(TCL_NO_DEPRECATED)
-int
Tcl_UniCharNcmp(
- const unsigned short *ucs, /* Unicode string to compare to uct. */
- const unsigned short *uct, /* Unicode string ucs is compared to. */
- unsigned long numChars) /* Number of chars (not bytes) to compare. */
+ CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ unsigned long numChars) /* Number of unichars to compare. */
{
-#if defined(WORDS_BIGENDIAN)
+#ifdef WORDS_BIGENDIAN
/*
* We are definitely on a big-endian machine; memcmp() is safe
*/
@@ -2095,27 +1300,21 @@ Tcl_UniCharNcmp(
for ( ; numChars != 0; ucs++, uct++, numChars--) {
if (*ucs != *uct) {
- /* special case for handling upper surrogates */
- if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) {
- return 1;
- } else if (((*uct & 0xFC00) == 0xD800)) {
- return -1;
- }
return (*ucs - *uct);
}
}
return 0;
#endif /* WORDS_BIGENDIAN */
}
-#endif
+
/*
*----------------------------------------------------------------------
*
* Tcl_UniCharNcasecmp --
*
- * Compare at most numChars chars (not bytes) of string ucs to string uct case
+ * Compare at most numChars unichars of string ucs to string uct case
* insensitive. Both ucs and uct are assumed to be at least numChars
- * chars long.
+ * unichars long.
*
* Results:
* Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
@@ -2127,32 +1326,11 @@ Tcl_UniCharNcmp(
*/
int
-TclUniCharNcasecmp(
- const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
- for ( ; numChars != 0; numChars--, ucs++, uct++) {
- if (*ucs != *uct) {
- Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
- Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
-
- if (lcs != lct) {
- return (lcs - lct);
- }
- }
- }
- return 0;
-}
-
-int
-TclUniCharNcasememcmp(
- const void *ucsPtr, /* Unicode string to compare to uct. */
- const void *uctPtr, /* Unicode string ucs is compared to. */
- size_t numChars) /* Number of chars (not bytes) to compare. */
+Tcl_UniCharNcasecmp(
+ CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ unsigned long numChars) /* Number of unichars to compare. */
{
- const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr;
- const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr;
for ( ; numChars != 0; numChars--, ucs++, uct++) {
if (*ucs != *uct) {
Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
@@ -2165,33 +1343,6 @@ TclUniCharNcasememcmp(
}
return 0;
}
-
-#if !defined(TCL_NO_DEPRECATED)
-int
-Tcl_UniCharNcasecmp(
- const unsigned short *ucs, /* Unicode string to compare to uct. */
- const unsigned short *uct, /* Unicode string ucs is compared to. */
- unsigned long numChars) /* Number of chars (not bytes) to compare. */
-{
- for ( ; numChars != 0; numChars--, ucs++, uct++) {
- if (*ucs != *uct) {
- unsigned short lcs = Tcl_UniCharToLower(*ucs);
- unsigned short lct = Tcl_UniCharToLower(*uct);
-
- if (lcs != lct) {
- /* special case for handling upper surrogates */
- if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) {
- return 1;
- } else if (((lct & 0xFC00) == 0xD800)) {
- return -1;
- }
- return (lcs - lct);
- }
- }
- }
- return 0;
-}
-#endif
/*
*----------------------------------------------------------------------
@@ -2213,9 +1364,6 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
@@ -2239,9 +1387,6 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
@@ -2265,11 +1410,6 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- /* Clear away extension bits, if any */
- ch &= 0x1FFFFF;
- return ((ch == 0xE0001) || ((unsigned)(ch - 0xE0020) <= 0x5F));
- }
return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
@@ -2293,9 +1433,6 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
@@ -2319,9 +1456,6 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
- }
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
@@ -2345,9 +1479,6 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return (GetCategory(ch) == LOWERCASE_LETTER);
}
@@ -2371,9 +1502,6 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
- }
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
@@ -2397,9 +1525,6 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
@@ -2423,20 +1548,14 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
- /* Ignore upper 11 bits. */
- ch &= 0x1FFFFF;
-
/*
* If the character is within the first 128 characters (below 0x80), just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (ch < 0x80) {
- return TclIsSpaceProcM((char) ch);
- } else if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
- || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
+ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ return TclIsSpaceProc((char) ch);
+ } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -2463,9 +1582,6 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return (GetCategory(ch) == UPPERCASE_LETTER);
}
@@ -2489,9 +1605,6 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
return ((WORD_BITS >> GetCategory(ch)) & 1);
}
@@ -2519,182 +1632,14 @@ Tcl_UniCharIsWordChar(
*/
int
-TclUniCharCaseMatch(
- const Tcl_UniChar *uniStr, /* Unicode String. */
- const Tcl_UniChar *uniPattern,
- /* Pattern, which may contain special
- * characters. */
- int nocase) /* 0 for case sensitive, 1 for insensitive */
-{
- Tcl_UniChar ch1 = 0, p;
-
- while (1) {
- p = *uniPattern;
-
- /*
- * See if we're at the end of both the pattern and the string. If so,
- * we succeeded. If we're at the end of the pattern but not at the end
- * of the string, we failed.
- */
-
- if (p == 0) {
- return (*uniStr == 0);
- }
- if ((*uniStr == 0) && (p != '*')) {
- return 0;
- }
-
- /*
- * Check for a "*" as the next pattern character. It matches any
- * substring. We handle this by skipping all the characters up to the
- * next matching one in the pattern, and then calling ourselves
- * recursively for each postfix of string, until either we match or we
- * reach the end of the string.
- */
-
- if (p == '*') {
- /*
- * Skip all successive *'s in the pattern
- */
-
- while (*(++uniPattern) == '*') {
- /* empty body */
- }
- p = *uniPattern;
- if (p == 0) {
- return 1;
- }
- if (nocase) {
- p = Tcl_UniCharToLower(p);
- }
- while (1) {
- /*
- * Optimization for matching - cruise through the string
- * quickly if the next char in the pattern isn't a special
- * character
- */
-
- if ((p != '[') && (p != '?') && (p != '\\')) {
- if (nocase) {
- while (*uniStr && (p != *uniStr)
- && (p != Tcl_UniCharToLower(*uniStr))) {
- uniStr++;
- }
- } else {
- while (*uniStr && (p != *uniStr)) {
- uniStr++;
- }
- }
- }
- if (TclUniCharCaseMatch(uniStr, uniPattern, nocase)) {
- return 1;
- }
- if (*uniStr == 0) {
- return 0;
- }
- uniStr++;
- }
- }
-
- /*
- * Check for a "?" as the next pattern character. It matches any
- * single character.
- */
-
- if (p == '?') {
- uniPattern++;
- uniStr++;
- continue;
- }
-
- /*
- * Check for a "[" as the next pattern character. It is followed by a
- * list of characters that are acceptable, or by a range (two
- * characters separated by "-").
- */
-
- if (p == '[') {
- Tcl_UniChar startChar, endChar;
-
- uniPattern++;
- ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
- uniStr++;
- while (1) {
- if ((*uniPattern == ']') || (*uniPattern == 0)) {
- return 0;
- }
- startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
- : *uniPattern);
- uniPattern++;
- if (*uniPattern == '-') {
- uniPattern++;
- if (*uniPattern == 0) {
- return 0;
- }
- endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
- : *uniPattern);
- uniPattern++;
- if (((startChar <= ch1) && (ch1 <= endChar))
- || ((endChar <= ch1) && (ch1 <= startChar))) {
- /*
- * Matches ranges of form [a-z] or [z-a].
- */
- break;
- }
- } else if (startChar == ch1) {
- break;
- }
- }
- while (*uniPattern != ']') {
- if (*uniPattern == 0) {
- uniPattern--;
- break;
- }
- uniPattern++;
- }
- uniPattern++;
- continue;
- }
-
- /*
- * If the next pattern character is '\', just strip off the '\' so we
- * do exact matching on the character that follows.
- */
-
- if (p == '\\') {
- if (*(++uniPattern) == '\0') {
- return 0;
- }
- }
-
- /*
- * There's no special character. Just make sure that the next bytes of
- * each string match.
- */
-
- if (nocase) {
- if (Tcl_UniCharToLower(*uniStr) !=
- Tcl_UniCharToLower(*uniPattern)) {
- return 0;
- }
- } else if (*uniStr != *uniPattern) {
- return 0;
- }
- uniStr++;
- uniPattern++;
- }
-}
-
-#if !defined(TCL_NO_DEPRECATED)
-int
Tcl_UniCharCaseMatch(
- const unsigned short *uniStr, /* Unicode String. */
- const unsigned short *uniPattern,
+ CONST Tcl_UniChar *uniStr, /* Unicode String. */
+ CONST Tcl_UniChar *uniPattern,
/* Pattern, which may contain special
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- unsigned short ch1 = 0, p;
+ Tcl_UniChar ch1, p;
while (1) {
p = *uniPattern;
@@ -2782,7 +1727,7 @@ Tcl_UniCharCaseMatch(
*/
if (p == '[') {
- unsigned short startChar, endChar;
+ Tcl_UniChar startChar, endChar;
uniPattern++;
ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
@@ -2852,7 +1797,6 @@ Tcl_UniCharCaseMatch(
uniPattern++;
}
}
-#endif
/*
*----------------------------------------------------------------------
@@ -2877,14 +1821,14 @@ Tcl_UniCharCaseMatch(
int
TclUniCharMatch(
- const Tcl_UniChar *string, /* Unicode String. */
- Tcl_Size strLen, /* Length of String */
- const Tcl_UniChar *pattern, /* Pattern, which may contain special
+ CONST Tcl_UniChar *string, /* Unicode String. */
+ int strLen, /* Length of String */
+ CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
* characters. */
- Tcl_Size ptnLen, /* Length of Pattern */
+ int ptnLen, /* Length of Pattern */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- const Tcl_UniChar *stringEnd, *patternEnd;
+ CONST Tcl_UniChar *stringEnd, *patternEnd;
Tcl_UniChar p;
stringEnd = string + strLen;