summary | refs | log | tree | commit | diff | stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 445
1 files changed, 43 insertions, 402 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 44a0a75..ca4a166 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -207,13 +207,18 @@ Invalid(
Tcl_Size
Tcl_UniCharToUtf(
int ch, /* The Tcl_UniChar to be stored in the
- * buffer.
+ * buffer. May be OR'ed with the TCL_COMBINE flag.
*/
char *buf) /* Buffer in which the UTF-8 representation of
* ch is stored. Must be large enough to hold the UTF-8
* character (at most 4 bytes).
*/
{
+ int flags = ch;
+
+ if (ch >= TCL_COMBINE) {
+ ch &= (TCL_COMBINE - 1);
+ }
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
return 1;
@@ -225,7 +230,8 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
- if ((ch & 0xF800) == 0xD800) {
+ if ((flags & TCL_COMBINE) &&
+ ((ch & 0xF800) == 0xD800)) {
if (ch & 0x0400) {
/* Low surrogate */
if ( (0x80 == (0xC0 & buf[0]))
@@ -302,7 +308,6 @@ three:
*---------------------------------------------------------------------------
*/
-#undef Tcl_UniCharToUtfDString
char *
Tcl_UniCharToUtfDString(
const int *uniStr, /* Unicode string to convert to UTF-8. */
@@ -386,7 +391,7 @@ Tcl_Char16ToUtfDString(
/* Special case for handling high surrogates. */
p += Tcl_UniCharToUtf(-1, p);
}
- len = Tcl_UniCharToUtf(*w, p);
+ len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p);
p += len;
if ((*w >= 0xD800) && (len < 3)) {
len = 0; /* Indication that high surrogate was found */
@@ -417,15 +422,6 @@ Tcl_Char16ToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
- * Special handling of Surrogate pairs is done:
- * For any UTF-8 string containing a character outside of the BMP, the
- * first call to this function will fill *chPtr with the high surrogate
- * and generate a return value of 1. Calling Tcl_UtfToUniChar again
- * will produce the low surrogate and a return value of 3. Because *chPtr
- * is used to remember whether the high surrogate is already produced, it
- * is recommended to initialize the variable it points to as 0 before
- * the first call to Tcl_UtfToUniChar is done.
- *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -443,7 +439,6 @@ static const unsigned short cp1252[32] = {
0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
};
-#undef Tcl_UtfToUniChar
Tcl_Size
Tcl_UtfToUniChar(
const char *src, /* The UTF-8 string. */
@@ -644,7 +639,6 @@ Tcl_UtfToChar16(
*---------------------------------------------------------------------------
*/
-#undef Tcl_UtfToUniCharDString
int *
Tcl_UtfToUniCharDString(
const char *src, /* UTF-8 string to convert to Unicode. */
@@ -807,7 +801,7 @@ Tcl_UtfCharComplete(
*/
Tcl_Size
-TclNumUtfChars(
+Tcl_NumUtfChars(
const char *src, /* The UTF-8 string to measure. */
Tcl_Size length) /* The length of the string in bytes, or
* negative value for strlen(src). */
@@ -817,7 +811,7 @@ TclNumUtfChars(
if (length < 0) {
/* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
- while ((*src != '\0') && (i < INT_MAX)) {
+ while (*src != '\0') {
src += TclUtfToUniChar(src, &ch);
i++;
}
@@ -858,9 +852,8 @@ TclNumUtfChars(
return i;
}
-#if !defined(TCL_NO_DEPRECATED)
Tcl_Size
-Tcl_NumUtfChars(
+TclNumUtfChars(
const char *src, /* The UTF-8 string to measure. */
Tcl_Size length) /* The length of the string in bytes, or
* negative for strlen(src). */
@@ -870,7 +863,7 @@ Tcl_NumUtfChars(
if (length < 0) {
/* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
- while ((*src != '\0') && (i < INT_MAX)) {
+ while (*src != '\0') {
src += Tcl_UtfToChar16(src, &ch);
i++;
}
@@ -910,7 +903,6 @@ Tcl_NumUtfChars(
}
return i;
}
-#endif
/*
*---------------------------------------------------------------------------
@@ -1189,20 +1181,16 @@ Tcl_UniCharAtIndex(
const char *src, /* The UTF-8 string to dereference. */
Tcl_Size index) /* The position of the desired character. */
{
- unsigned short ch = 0;
+ Tcl_UniChar ch = 0;
int i = 0;
if (index < 0) {
return -1;
}
- while (index-- > 0) {
- i = Tcl_UtfToChar16(src, &ch);
+ while (index--) {
+ i = TclUtfToUniChar(src, &ch);
src += i;
}
- if ((ch >= 0xD800) && (i < 3)) {
- /* Index points at character following high Surrogate */
- return -1;
- }
Tcl_UtfToUniChar(src, &i);
return i;
}
@@ -1225,21 +1213,20 @@ Tcl_UniCharAtIndex(
*/
const char *
-TclUtfAtIndex(
+Tcl_UtfAtIndex(
const char *src, /* The UTF-8 string. */
Tcl_Size index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
while (index-- > 0) {
- src += TclUtfToUniChar(src, &ch);
+ src += Tcl_UtfToUniChar(src, &ch);
}
return src;
}
-#if !defined(TCL_NO_DEPRECATED)
const char *
-Tcl_UtfAtIndex(
+TclUtfAtIndex(
const char *src, /* The UTF-8 string. */
Tcl_Size index) /* The position of the desired character. */
{
@@ -1257,7 +1244,6 @@ Tcl_UtfAtIndex(
}
return src;
}
-#endif
/*
*---------------------------------------------------------------------------
@@ -1353,7 +1339,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) {
+ if (len < TclUtfCount(upChar)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1406,7 +1392,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
+ if (len < TclUtfCount(lowChar)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1456,7 +1442,7 @@ Tcl_UtfToTitle(
len = Tcl_UtfToUniChar(src, &ch);
titleChar = Tcl_UniCharToTitle(ch);
- if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) {
+ if (len < TclUtfCount(titleChar)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1472,7 +1458,7 @@ Tcl_UtfToTitle(
lowChar = Tcl_UniCharToLower(lowChar);
}
- if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
+ if (len < TclUtfCount(lowChar)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1503,10 +1489,12 @@ Tcl_UtfToTitle(
int
TclpUtfNcmp2(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
- unsigned long numBytes) /* Number of *bytes* to compare. */
+ const void *csPtr, /* UTF string to compare to ct. */
+ const void *ctPtr, /* UTF string cs is compared to. */
+ size_t numBytes) /* Number of *bytes* to compare. */
{
+ const char *cs = (const char *)csPtr;
+ const char *ct = (const char *)ctPtr;
/*
* We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
* check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
@@ -1536,8 +1524,8 @@ TclpUtfNcmp2(
*
* Tcl_UtfNcmp --
*
- * Compare at most numChars UTF-16 chars of string cs to string ct. Both cs
- * and ct are assumed to be at least numChars UTF-16 chars long.
+ * Compare at most numChars chars (not bytes) of string cs to string ct. Both cs
+ * and ct are assumed to be at least numChars chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -1548,12 +1536,11 @@ TclpUtfNcmp2(
*----------------------------------------------------------------------
*/
-#if !defined(TCL_NO_DEPRECATED)
int
-Tcl_UtfNcmp(
+TclUtfNcmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct, /* UTF string cs is compared to. */
- unsigned long numChars) /* Number of UTF-16 chars to compare. */
+ size_t numChars) /* Number of UTF-16 chars to compare. */
{
unsigned short ch1 = 0, ch2 = 0;
@@ -1565,7 +1552,7 @@ Tcl_UtfNcmp(
while (numChars-- > 0) {
/*
- * n must be interpreted as UTF-16 chars, not bytes. This should be called
+ * n must be interpreted as chars, not bytes. This should be called
* only when both strings are of at least n UTF-16 chars long (no need for \0
* check)
*/
@@ -1586,10 +1573,9 @@ Tcl_UtfNcmp(
}
return 0;
}
-#endif /* TCL_NO_DEPRECATED */
int
-TclUtfNcmp(
+Tcl_UtfNcmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct, /* UTF string cs is compared to. */
size_t numChars) /* Number of chars to compare. */
@@ -1617,46 +1603,14 @@ TclUtfNcmp(
}
return 0;
}
-
-int
-TclUtfNmemcmp(
- const void *csPtr, /* UTF string to compare to ct. */
- const void *ctPtr, /* UTF string cs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
- Tcl_UniChar ch1 = 0, ch2 = 0;
- const char *cs = (const char *)csPtr;
- const char *ct = (const char *)ctPtr;
-
- /*
- * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
- * (the byte 0x01.)
- */
-
- while (numChars-- > 0) {
- /*
- * n must be interpreted as chars, not bytes. This should be called
- * only when both strings are of at least n chars long (no need for \0
- * check)
- */
-
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- return (ch1 - ch2);
- }
- }
- return 0;
-}
/*
*----------------------------------------------------------------------
*
* Tcl_UtfNcasecmp --
*
- * Compare at most numChars UTF-16 chars of string cs to string ct case
- * insensitive. Both cs and ct are assumed to be at least numChars UTF-16
+ * Compare at most numChars chars (not bytes) of string cs to string ct case
+ * insensitive. Both cs and ct are assumed to be at least numChars UTF
* chars long.
*
* Results:
@@ -1668,12 +1622,11 @@ TclUtfNmemcmp(
*----------------------------------------------------------------------
*/
-#if !defined(TCL_NO_DEPRECATED)
int
-Tcl_UtfNcasecmp(
+TclUtfNcasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct, /* UTF string cs is compared to. */
- unsigned long numChars) /* Number of UTF-16 chars to compare. */
+ size_t numChars) /* Number of UTF-16 chars to compare. */
{
unsigned short ch1 = 0, ch2 = 0;
@@ -1703,11 +1656,9 @@ Tcl_UtfNcasecmp(
}
return 0;
}
-#endif /* TCL_NO_DEPRECATED */
-
int
-TclUtfNcasecmp(
+Tcl_UtfNcasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct, /* UTF string cs is compared to. */
size_t numChars) /* Number of chars to compare. */
@@ -1733,35 +1684,6 @@ TclUtfNcasecmp(
return 0;
}
-int
-TclUtfNcasememcmp(
- const void *csPtr, /* UTF string to compare to ct. */
- const void *ctPtr, /* UTF string cs is compared to. */
- size_t numChars) /* Number of chars to compare. */
-{
- const char *cs = (const char *)csPtr;
- const char *ct = (const char *)ctPtr;
- Tcl_UniChar ch1 = 0, ch2 = 0;
-
- while (numChars-- > 0) {
- /*
- * n must be interpreted as chars, not bytes.
- * This should be called only when both strings are of
- * at least n chars long (no need for \0 check)
- */
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- ch1 = Tcl_UniCharToLower(ch1);
- ch2 = Tcl_UniCharToLower(ch2);
- if (ch1 != ch2) {
- return (ch1 - ch2);
- }
- }
- }
- return 0;
-}
-
/*
*----------------------------------------------------------------------
*
@@ -1988,7 +1910,6 @@ Tcl_Char16Len(
*----------------------------------------------------------------------
*/
-#undef Tcl_UniCharLen
Tcl_Size
Tcl_UniCharLen(
const int *uniStr) /* Unicode string to find length of. */
@@ -2005,7 +1926,7 @@ Tcl_UniCharLen(
/*
*----------------------------------------------------------------------
*
- * Tcl_UniCharNcmp --
+ * TclUniCharNcmp --
*
* Compare at most numChars chars (not bytes) of string ucs to string uct.
* Both ucs and uct are assumed to be at least numChars chars long.
@@ -2046,73 +1967,10 @@ TclUniCharNcmp(
#endif /* WORDS_BIGENDIAN */
}
-int
-TclUniCharNmemcmp(
- const void *ucsPtr, /* Unicode string to compare to uct. */
- const void *uctPtr, /* Unicode string ucs is compared to. */
- size_t numChars) /* Number of chars (not bytes) to compare. */
-{
- const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr;
- const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr;
-#if defined(WORDS_BIGENDIAN)
- /*
- * We are definitely on a big-endian machine; memcmp() is safe
- */
-
- return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
-
-#else /* !WORDS_BIGENDIAN */
- /*
- * We can't simply call memcmp() because that is not lexically correct.
- */
-
- for ( ; numChars != 0; ucs++, uct++, numChars--) {
- if (*ucs != *uct) {
- return (*ucs - *uct);
- }
- }
- return 0;
-#endif /* WORDS_BIGENDIAN */
-}
-
-#if !defined(TCL_NO_DEPRECATED)
-int
-Tcl_UniCharNcmp(
- const unsigned short *ucs, /* Unicode string to compare to uct. */
- const unsigned short *uct, /* Unicode string ucs is compared to. */
- unsigned long numChars) /* Number of chars (not bytes) to compare. */
-{
-#if defined(WORDS_BIGENDIAN)
- /*
- * We are definitely on a big-endian machine; memcmp() is safe
- */
-
- return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
-
-#else /* !WORDS_BIGENDIAN */
- /*
- * We can't simply call memcmp() because that is not lexically correct.
- */
-
- for ( ; numChars != 0; ucs++, uct++, numChars--) {
- if (*ucs != *uct) {
- /* special case for handling upper surrogates */
- if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) {
- return 1;
- } else if (((*uct & 0xFC00) == 0xD800)) {
- return -1;
- }
- return (*ucs - *uct);
- }
- }
- return 0;
-#endif /* WORDS_BIGENDIAN */
-}
-#endif
/*
*----------------------------------------------------------------------
*
- * Tcl_UniCharNcasecmp --
+ * TclUniCharNcasecmp --
*
* Compare at most numChars chars (not bytes) of string ucs to string uct case
* insensitive. Both ucs and uct are assumed to be at least numChars
@@ -2145,54 +2003,6 @@ TclUniCharNcasecmp(
}
return 0;
}
-
-int
-TclUniCharNcasememcmp(
- const void *ucsPtr, /* Unicode string to compare to uct. */
- const void *uctPtr, /* Unicode string ucs is compared to. */
- size_t numChars) /* Number of chars (not bytes) to compare. */
-{
- const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr;
- const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr;
- for ( ; numChars != 0; numChars--, ucs++, uct++) {
- if (*ucs != *uct) {
- Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
- Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
-
- if (lcs != lct) {
- return (lcs - lct);
- }
- }
- }
- return 0;
-}
-
-#if !defined(TCL_NO_DEPRECATED)
-int
-Tcl_UniCharNcasecmp(
- const unsigned short *ucs, /* Unicode string to compare to uct. */
- const unsigned short *uct, /* Unicode string ucs is compared to. */
- unsigned long numChars) /* Number of chars (not bytes) to compare. */
-{
- for ( ; numChars != 0; numChars--, ucs++, uct++) {
- if (*ucs != *uct) {
- unsigned short lcs = Tcl_UniCharToLower(*ucs);
- unsigned short lct = Tcl_UniCharToLower(*uct);
-
- if (lcs != lct) {
- /* special case for handling upper surrogates */
- if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) {
- return 1;
- } else if (((lct & 0xFC00) == 0xD800)) {
- return -1;
- }
- return (lcs - lct);
- }
- }
- }
- return 0;
-}
-#endif
/*
*----------------------------------------------------------------------
@@ -2529,7 +2339,7 @@ Tcl_UniCharIsWordChar(
/*
*----------------------------------------------------------------------
*
- * Tcl_UniCharCaseMatch --
+ * TclUniCharCaseMatch --
*
* See if a particular Unicode string matches a particular pattern.
* Allows case insensitivity. This is the Unicode equivalent of the char*
@@ -2715,175 +2525,6 @@ TclUniCharCaseMatch(
uniPattern++;
}
}
-
-#if !defined(TCL_NO_DEPRECATED)
-int
-Tcl_UniCharCaseMatch(
- const unsigned short *uniStr, /* Unicode String. */
- const unsigned short *uniPattern,
- /* Pattern, which may contain special
- * characters. */
- int nocase) /* 0 for case sensitive, 1 for insensitive */
-{
- unsigned short ch1 = 0, p;
-
- while (1) {
- p = *uniPattern;
-
- /*
- * See if we're at the end of both the pattern and the string. If so,
- * we succeeded. If we're at the end of the pattern but not at the end
- * of the string, we failed.
- */
-
- if (p == 0) {
- return (*uniStr == 0);
- }
- if ((*uniStr == 0) && (p != '*')) {
- return 0;
- }
-
- /*
- * Check for a "*" as the next pattern character. It matches any
- * substring. We handle this by skipping all the characters up to the
- * next matching one in the pattern, and then calling ourselves
- * recursively for each postfix of string, until either we match or we
- * reach the end of the string.
- */
-
- if (p == '*') {
- /*
- * Skip all successive *'s in the pattern
- */
-
- while (*(++uniPattern) == '*') {
- /* empty body */
- }
- p = *uniPattern;
- if (p == 0) {
- return 1;
- }
- if (nocase) {
- p = Tcl_UniCharToLower(p);
- }
- while (1) {
- /*
- * Optimization for matching - cruise through the string
- * quickly if the next char in the pattern isn't a special
- * character
- */
-
- if ((p != '[') && (p != '?') && (p != '\\')) {
- if (nocase) {
- while (*uniStr && (p != *uniStr)
- && (p != Tcl_UniCharToLower(*uniStr))) {
- uniStr++;
- }
- } else {
- while (*uniStr && (p != *uniStr)) {
- uniStr++;
- }
- }
- }
- if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
- return 1;
- }
- if (*uniStr == 0) {
- return 0;
- }
- uniStr++;
- }
- }
-
- /*
- * Check for a "?" as the next pattern character. It matches any
- * single character.
- */
-
- if (p == '?') {
- uniPattern++;
- uniStr++;
- continue;
- }
-
- /*
- * Check for a "[" as the next pattern character. It is followed by a
- * list of characters that are acceptable, or by a range (two
- * characters separated by "-").
- */
-
- if (p == '[') {
- unsigned short startChar, endChar;
-
- uniPattern++;
- ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
- uniStr++;
- while (1) {
- if ((*uniPattern == ']') || (*uniPattern == 0)) {
- return 0;
- }
- startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
- : *uniPattern);
- uniPattern++;
- if (*uniPattern == '-') {
- uniPattern++;
- if (*uniPattern == 0) {
- return 0;
- }
- endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
- : *uniPattern);
- uniPattern++;
- if (((startChar <= ch1) && (ch1 <= endChar))
- || ((endChar <= ch1) && (ch1 <= startChar))) {
- /*
- * Matches ranges of form [a-z] or [z-a].
- */
- break;
- }
- } else if (startChar == ch1) {
- break;
- }
- }
- while (*uniPattern != ']') {
- if (*uniPattern == 0) {
- uniPattern--;
- break;
- }
- uniPattern++;
- }
- uniPattern++;
- continue;
- }
-
- /*
- * If the next pattern character is '\', just strip off the '\' so we
- * do exact matching on the character that follows.
- */
-
- if (p == '\\') {
- if (*(++uniPattern) == '\0') {
- return 0;
- }
- }
-
- /*
- * There's no special character. Just make sure that the next bytes of
- * each string match.
- */
-
- if (nocase) {
- if (Tcl_UniCharToLower(*uniStr) !=
- Tcl_UniCharToLower(*uniPattern)) {
- return 0;
- }
- } else if (*uniStr != *uniPattern) {
- return 0;
- }
- uniStr++;
- uniPattern++;
- }
-}
-#endif
/*
*----------------------------------------------------------------------
@@ -2892,7 +2533,7 @@ Tcl_UniCharCaseMatch(
*
* See if a particular Unicode string matches a particular pattern.
* Allows case insensitivity. This is the Unicode equivalent of the char*
- * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
+ * Tcl_StringCaseMatch. This variant of TclUniCharCaseMatch uses counted
* Strings, so embedded NULLs are allowed.
*
* Results: