summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c274
1 files changed, 159 insertions, 115 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e5497a4..68119a4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -59,7 +59,7 @@
* UTF-8.
*/
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -73,16 +73,7 @@ static CONST unsigned char totalBytes[256] = {
#else
1,1,1,1,1,1,1,1,
#endif
-#if TCL_UTF_MAX > 4
- 5,5,5,5,
-#else
- 1,1,1,1,
-#endif
-#if TCL_UTF_MAX > 5
- 6,6,6,6
-#else
- 1,1,1,1
-#endif
+ 1,1,1,1,1,1,1,1
};
/*
@@ -111,25 +102,16 @@ INLINE static int
UtfCount(
int ch) /* The Tcl_UniChar whose size is returned. */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
}
if (ch <= 0x7FF) {
return 2;
}
- if (ch <= 0xFFFF) {
- return 3;
- }
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
return 4;
}
- if (ch <= 0x3FFFFFF) {
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- return 6;
- }
#endif
return 3;
}
@@ -161,7 +143,7 @@ Tcl_UniCharToUtf(
* large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
return 1;
}
@@ -172,43 +154,43 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
- three:
- buf[2] = (char) ((ch | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
+#if TCL_UTF_MAX == 4
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x0400) {
+ /* Low surrogate */
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
+ return 4;
+ } else {
+ /* High surrogate */
+ ch += 0x40;
+ buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
+ buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
+ buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ return 0;
+ }
+ }
+#endif
+ goto three;
}
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (ch <= 0x10FFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
- if (ch <= 0x3FFFFFF) {
- buf[4] = (char) ((ch | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- buf[5] = (char) ((ch | 0x80) & 0xBF);
- buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
#endif
}
ch = 0xFFFD;
- goto three;
+three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
}
/*
@@ -231,13 +213,13 @@ Tcl_UniCharToUtf(
char *
Tcl_UniCharToUtfDString(
- CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
int uniLength, /* Length of Unicode string in Tcl_UniChars
* (must be >= 0). */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- CONST Tcl_UniChar *w, *wEnd;
+ const Tcl_UniChar *w, *wEnd;
char *p, *string;
int oldLength;
@@ -289,7 +271,7 @@ Tcl_UniCharToUtfDString(
int
Tcl_UtfToUniChar(
- register CONST char *src, /* The UTF-8 string. */
+ register const char *src, /* The UTF-8 string. */
register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
@@ -323,9 +305,6 @@ Tcl_UtfToUniChar(
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
} else if (byte < 0xF0) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
@@ -341,31 +320,23 @@ Tcl_UtfToUniChar(
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
}
#if TCL_UTF_MAX > 3
- {
- int ch, total, trail;
-
- total = totalBytes[byte];
- trail = total - 1;
- if (trail > 0) {
- ch = byte & (0x3F >> trail);
- do {
- src++;
- if ((*src & 0xC0) != 0x80) {
- *chPtr = byte;
- return 1;
- }
- ch <<= 6;
- ch |= (*src & 0x3F);
- trail--;
- } while (trail > 0);
- *chPtr = ch;
- return total;
+ else if (byte < 0xF8) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+ /*
+ * Four-byte-character lead byte followed by three trail bytes.
+ */
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+ return 4;
}
+
+ /*
+ * A three-byte-character lead-byte not followed by two trail-bytes
+ * represents itself.
+ */
}
#endif
@@ -393,7 +364,7 @@ Tcl_UtfToUniChar(
Tcl_UniChar *
Tcl_UtfToUniCharDString(
- CONST char *src, /* UTF-8 string to convert to Unicode. */
+ const char *src, /* UTF-8 string to convert to Unicode. */
int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
Tcl_DString *dsPtr) /* Unicode representation of string is
@@ -401,7 +372,7 @@ Tcl_UtfToUniCharDString(
* DString. */
{
Tcl_UniChar *w, *wString;
- CONST char *p, *end;
+ const char *p, *end;
int oldLength;
if (length < 0) {
@@ -414,6 +385,7 @@ Tcl_UtfToUniCharDString(
*/
oldLength = Tcl_DStringLength(dsPtr);
+/* TODO: fix overreach! */
Tcl_DStringSetLength(dsPtr,
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -452,7 +424,7 @@ Tcl_UtfToUniCharDString(
int
Tcl_UtfCharComplete(
- CONST char *src, /* String to check if first few bytes contain
+ const char *src, /* String to check if first few bytes contain
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
@@ -482,7 +454,7 @@ Tcl_UtfCharComplete(
int
Tcl_NumUtfChars(
- register CONST char *src, /* The UTF-8 string to measure. */
+ register const char *src, /* The UTF-8 string to measure. */
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
@@ -540,9 +512,9 @@ Tcl_NumUtfChars(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfFindFirst(
- CONST char *src, /* The UTF-8 string to be searched. */
+ const char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
int len;
@@ -579,14 +551,14 @@ Tcl_UtfFindFirst(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfFindLast(
- CONST char *src, /* The UTF-8 string to be searched. */
+ const char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
- CONST char *last;
+ const char *last;
last = NULL;
while (1) {
@@ -621,9 +593,9 @@ Tcl_UtfFindLast(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfNext(
- CONST char *src) /* The current location in the string. */
+ const char *src) /* The current location in the string. */
{
Tcl_UniChar ch;
@@ -651,13 +623,13 @@ Tcl_UtfNext(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfPrev(
- CONST char *src, /* The current location in the string. */
- CONST char *start) /* Pointer to the beginning of the string, to
+ const char *src, /* The current location in the string. */
+ const char *start) /* Pointer to the beginning of the string, to
* avoid going backwards too far. */
{
- CONST char *look;
+ const char *look;
int i, byte;
src--;
@@ -700,10 +672,10 @@ Tcl_UtfPrev(
Tcl_UniChar
Tcl_UniCharAtIndex(
- register CONST char *src, /* The UTF-8 string to dereference. */
+ register const char *src, /* The UTF-8 string to dereference. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index >= 0) {
index--;
@@ -729,9 +701,9 @@ Tcl_UniCharAtIndex(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfAtIndex(
- register CONST char *src, /* The UTF-8 string. */
+ register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
@@ -771,7 +743,7 @@ Tcl_UtfAtIndex(
int
Tcl_UtfBackslash(
- CONST char *src, /* Points to the backslash character of a
+ const char *src, /* Points to the backslash character of a
* backslash sequence. */
int *readPtr, /* Fill in with number of characters read from
* src, unless NULL. */
@@ -983,8 +955,8 @@ Tcl_UtfToTitle(
int
TclpUtfNcmp2(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numBytes) /* Number of *bytes* to compare. */
{
/*
@@ -1030,8 +1002,8 @@ TclpUtfNcmp2(
int
Tcl_UtfNcmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1078,8 +1050,8 @@ Tcl_UtfNcmp(
int
Tcl_UtfNcasecmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1122,8 +1094,8 @@ Tcl_UtfNcasecmp(
int
TclUtfCasecmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct) /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
{
while (*cs && *ct) {
Tcl_UniChar ch1, ch2;
@@ -1252,7 +1224,7 @@ Tcl_UniCharToTitle(
int
Tcl_UniCharLen(
- CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */
+ const Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
int len = 0;
@@ -1282,8 +1254,8 @@ Tcl_UniCharLen(
int
Tcl_UniCharNcmp(
- CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
#ifdef WORDS_BIGENDIAN
@@ -1327,8 +1299,8 @@ Tcl_UniCharNcmp(
int
Tcl_UniCharNcasecmp(
- CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1364,6 +1336,11 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
@@ -1387,6 +1364,11 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
@@ -1410,6 +1392,18 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+ return 1;
+ }
+ if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+ return 1;
+ }
+ return 0;
+ }
+#endif
return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
@@ -1433,6 +1427,11 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
@@ -1456,6 +1455,12 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ }
+#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
@@ -1479,6 +1484,11 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == LOWERCASE_LETTER);
}
@@ -1502,6 +1512,12 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ }
+#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
@@ -1525,6 +1541,11 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
@@ -1548,14 +1569,27 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ /* Ignore upper 11 bits. */
+ ch &= 0x1fffff;
+#else
+ /* Ignore upper 16 bits. */
+ ch &= 0xffff;
+#endif
+
/*
* If the character is within the first 127 characters, just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ if (ch < 0x80) {
return TclIsSpaceProc((char) ch);
- } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
+#if TCL_UTF_MAX > 3
+ } else if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+#endif
+ } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
+ || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1582,6 +1616,11 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == UPPERCASE_LETTER);
}
@@ -1605,6 +1644,11 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((WORD_BITS >> GetCategory(ch)) & 1);
}
@@ -1633,8 +1677,8 @@ Tcl_UniCharIsWordChar(
int
Tcl_UniCharCaseMatch(
- CONST Tcl_UniChar *uniStr, /* Unicode String. */
- CONST Tcl_UniChar *uniPattern,
+ const Tcl_UniChar *uniStr, /* Unicode String. */
+ const Tcl_UniChar *uniPattern,
/* Pattern, which may contain special
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
@@ -1821,14 +1865,14 @@ Tcl_UniCharCaseMatch(
int
TclUniCharMatch(
- CONST Tcl_UniChar *string, /* Unicode String. */
+ const Tcl_UniChar *string, /* Unicode String. */
int strLen, /* Length of String */
- CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
+ const Tcl_UniChar *pattern, /* Pattern, which may contain special
* characters. */
int ptnLen, /* Length of Pattern */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- CONST Tcl_UniChar *stringEnd, *patternEnd;
+ const Tcl_UniChar *stringEnd, *patternEnd;
Tcl_UniChar p;
stringEnd = string + strLen;