summary refs log tree commit diff stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 296
1 files changed, 129 insertions, 167 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b33bf6a..e5497a4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -59,7 +59,7 @@
* UTF-8.
*/
-static const unsigned char totalBytes[256] = {
+static CONST unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -73,13 +73,28 @@ static const unsigned char totalBytes[256] = {
#else
1,1,1,1,1,1,1,1,
#endif
- 1,1,1,1,1,1,1,1
+#if TCL_UTF_MAX > 4
+ 5,5,5,5,
+#else
+ 1,1,1,1,
+#endif
+#if TCL_UTF_MAX > 5
+ 6,6,6,6
+#else
+ 1,1,1,1
+#endif
};
+
+/*
+ * Functions used only in this module.
+ */
+
+static int UtfCount(int ch);
/*
*---------------------------------------------------------------------------
*
- * TclUtfCount --
+ * UtfCount --
*
* Find the number of bytes in the Utf character "ch".
*
@@ -92,20 +107,29 @@ static const unsigned char totalBytes[256] = {
*---------------------------------------------------------------------------
*/
-int
-TclUtfCount(
+INLINE static int
+UtfCount(
int ch) /* The Tcl_UniChar whose size is returned. */
{
- if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
+ if ((ch > 0) && (ch < UNICODE_SELF)) {
return 1;
}
if (ch <= 0x7FF) {
return 2;
}
+ if (ch <= 0xFFFF) {
+ return 3;
+ }
#if TCL_UTF_MAX > 3
- if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
+ if (ch <= 0x1FFFFF) {
return 4;
}
+ if (ch <= 0x3FFFFFF) {
+ return 5;
+ }
+ if (ch <= 0x7FFFFFFF) {
+ return 6;
+ }
#endif
return 3;
}
@@ -128,7 +152,7 @@ TclUtfCount(
*---------------------------------------------------------------------------
*/
-int
+INLINE int
Tcl_UniCharToUtf(
int ch, /* The Tcl_UniChar to be stored in the
* buffer. */
@@ -137,7 +161,7 @@ Tcl_UniCharToUtf(
* large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
{
- if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
+ if ((ch > 0) && (ch < UNICODE_SELF)) {
buf[0] = (char) ch;
return 1;
}
@@ -148,43 +172,43 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
-#if TCL_UTF_MAX == 4
- if ((ch & 0xF800) == 0xD800) {
- if (ch & 0x0400) {
- /* Low surrogate */
- buf[3] = (char) ((ch | 0x80) & 0xBF);
- buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
- return 4;
- } else {
- /* High surrogate */
- ch += 0x40;
- buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
- buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
- buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
- return 0;
- }
- }
-#endif
- goto three;
+ three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
}
#if TCL_UTF_MAX > 3
- if (ch <= 0x10FFFF) {
+ if (ch <= 0x1FFFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
+ if (ch <= 0x3FFFFFF) {
+ buf[4] = (char) ((ch | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 24) | 0xF8);
+ return 5;
+ }
+ if (ch <= 0x7FFFFFFF) {
+ buf[5] = (char) ((ch | 0x80) & 0xBF);
+ buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 30) | 0xFC);
+ return 6;
+ }
#endif
}
ch = 0xFFFD;
-three:
- buf[2] = (char) ((ch | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
+ goto three;
}
/*
@@ -207,13 +231,13 @@ three:
char *
Tcl_UniCharToUtfDString(
- const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
int uniLength, /* Length of Unicode string in Tcl_UniChars
* (must be >= 0). */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- const Tcl_UniChar *w, *wEnd;
+ CONST Tcl_UniChar *w, *wEnd;
char *p, *string;
int oldLength;
@@ -265,7 +289,7 @@ Tcl_UniCharToUtfDString(
int
Tcl_UtfToUniChar(
- register const char *src, /* The UTF-8 string. */
+ register CONST char *src, /* The UTF-8 string. */
register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
@@ -299,6 +323,9 @@ Tcl_UtfToUniChar(
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
+
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
} else if (byte < 0xF0) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
@@ -314,23 +341,31 @@ Tcl_UtfToUniChar(
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
+
+ *chPtr = (Tcl_UniChar) byte;
+ return 1;
}
#if TCL_UTF_MAX > 3
- else if (byte < 0xF8) {
- if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
- /*
- * Four-byte-character lead byte followed by three trail bytes.
- */
-
- *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
- | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
- return 4;
+ {
+ int ch, total, trail;
+
+ total = totalBytes[byte];
+ trail = total - 1;
+ if (trail > 0) {
+ ch = byte & (0x3F >> trail);
+ do {
+ src++;
+ if ((*src & 0xC0) != 0x80) {
+ *chPtr = byte;
+ return 1;
+ }
+ ch <<= 6;
+ ch |= (*src & 0x3F);
+ trail--;
+ } while (trail > 0);
+ *chPtr = ch;
+ return total;
}
-
- /*
- * A three-byte-character lead-byte not followed by two trail-bytes
- * represents itself.
- */
}
#endif
@@ -358,7 +393,7 @@ Tcl_UtfToUniChar(
Tcl_UniChar *
Tcl_UtfToUniCharDString(
- const char *src, /* UTF-8 string to convert to Unicode. */
+ CONST char *src, /* UTF-8 string to convert to Unicode. */
int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
Tcl_DString *dsPtr) /* Unicode representation of string is
@@ -366,7 +401,7 @@ Tcl_UtfToUniCharDString(
* DString. */
{
Tcl_UniChar *w, *wString;
- const char *p, *end;
+ CONST char *p, *end;
int oldLength;
if (length < 0) {
@@ -379,7 +414,6 @@ Tcl_UtfToUniCharDString(
*/
oldLength = Tcl_DStringLength(dsPtr);
-/* TODO: fix overreach! */
Tcl_DStringSetLength(dsPtr,
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -418,7 +452,7 @@ Tcl_UtfToUniCharDString(
int
Tcl_UtfCharComplete(
- const char *src, /* String to check if first few bytes contain
+ CONST char *src, /* String to check if first few bytes contain
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
@@ -448,7 +482,7 @@ Tcl_UtfCharComplete(
int
Tcl_NumUtfChars(
- register const char *src, /* The UTF-8 string to measure. */
+ register CONST char *src, /* The UTF-8 string to measure. */
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
@@ -506,9 +540,9 @@ Tcl_NumUtfChars(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfFindFirst(
- const char *src, /* The UTF-8 string to be searched. */
+ CONST char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
int len;
@@ -545,14 +579,14 @@ Tcl_UtfFindFirst(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfFindLast(
- const char *src, /* The UTF-8 string to be searched. */
+ CONST char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
- const char *last;
+ CONST char *last;
last = NULL;
while (1) {
@@ -587,9 +621,9 @@ Tcl_UtfFindLast(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfNext(
- const char *src) /* The current location in the string. */
+ CONST char *src) /* The current location in the string. */
{
Tcl_UniChar ch;
@@ -617,13 +651,13 @@ Tcl_UtfNext(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfPrev(
- const char *src, /* The current location in the string. */
- const char *start) /* Pointer to the beginning of the string, to
+ CONST char *src, /* The current location in the string. */
+ CONST char *start) /* Pointer to the beginning of the string, to
* avoid going backwards too far. */
{
- const char *look;
+ CONST char *look;
int i, byte;
src--;
@@ -666,10 +700,10 @@ Tcl_UtfPrev(
Tcl_UniChar
Tcl_UniCharAtIndex(
- register const char *src, /* The UTF-8 string to dereference. */
+ register CONST char *src, /* The UTF-8 string to dereference. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch = 0;
+ Tcl_UniChar ch;
while (index >= 0) {
index--;
@@ -695,9 +729,9 @@ Tcl_UniCharAtIndex(
*---------------------------------------------------------------------------
*/
-const char *
+CONST char *
Tcl_UtfAtIndex(
- register const char *src, /* The UTF-8 string. */
+ register CONST char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
@@ -737,7 +771,7 @@ Tcl_UtfAtIndex(
int
Tcl_UtfBackslash(
- const char *src, /* Points to the backslash character of a
+ CONST char *src, /* Points to the backslash character of a
* backslash sequence. */
int *readPtr, /* Fill in with number of characters read from
* src, unless NULL. */
@@ -803,7 +837,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(upChar)) {
+ if (bytes < UtfCount(upChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -856,7 +890,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(lowChar)) {
+ if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -906,7 +940,7 @@ Tcl_UtfToTitle(
bytes = TclUtfToUniChar(src, &ch);
titleChar = Tcl_UniCharToTitle(ch);
- if (bytes < TclUtfCount(titleChar)) {
+ if (bytes < UtfCount(titleChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -918,7 +952,7 @@ Tcl_UtfToTitle(
bytes = TclUtfToUniChar(src, &ch);
lowChar = Tcl_UniCharToLower(ch);
- if (bytes < TclUtfCount(lowChar)) {
+ if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -949,8 +983,8 @@ Tcl_UtfToTitle(
int
TclpUtfNcmp2(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct, /* UTF string cs is compared to. */
unsigned long numBytes) /* Number of *bytes* to compare. */
{
/*
@@ -996,8 +1030,8 @@ TclpUtfNcmp2(
int
Tcl_UtfNcmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1044,8 +1078,8 @@ Tcl_UtfNcmp(
int
Tcl_UtfNcasecmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct, /* UTF string cs is compared to. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1088,8 +1122,8 @@ Tcl_UtfNcasecmp(
int
TclUtfCasecmp(
- const char *cs, /* UTF string to compare to ct. */
- const char *ct) /* UTF string cs is compared to. */
+ CONST char *cs, /* UTF string to compare to ct. */
+ CONST char *ct) /* UTF string cs is compared to. */
{
while (*cs && *ct) {
Tcl_UniChar ch1, ch2;
@@ -1218,7 +1252,7 @@ Tcl_UniCharToTitle(
int
Tcl_UniCharLen(
- const Tcl_UniChar *uniStr) /* Unicode string to find length of. */
+ CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
int len = 0;
@@ -1248,8 +1282,8 @@ Tcl_UniCharLen(
int
Tcl_UniCharNcmp(
- const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
#ifdef WORDS_BIGENDIAN
@@ -1293,8 +1327,8 @@ Tcl_UniCharNcmp(
int
Tcl_UniCharNcasecmp(
- const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1330,11 +1364,6 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
@@ -1358,11 +1387,6 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
@@ -1386,18 +1410,6 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
- return 1;
- }
- if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
- return 1;
- }
- return 0;
- }
-#endif
return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
@@ -1421,11 +1433,6 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
@@ -1449,12 +1456,6 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
- }
-#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
@@ -1478,11 +1479,6 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return (GetCategory(ch) == LOWERCASE_LETTER);
}
@@ -1506,12 +1502,6 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
- }
-#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
@@ -1535,11 +1525,6 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
@@ -1563,27 +1548,14 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- /* Ignore upper 11 bits. */
- ch &= 0x1fffff;
-#else
- /* Ignore upper 16 bits. */
- ch &= 0xffff;
-#endif
-
/*
* If the character is within the first 127 characters, just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (ch < 0x80) {
+ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
return TclIsSpaceProc((char) ch);
-#if TCL_UTF_MAX > 3
- } else if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
-#endif
- } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
- || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
+ } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1610,11 +1582,6 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return (GetCategory(ch) == UPPERCASE_LETTER);
}
@@ -1638,11 +1605,6 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
- if (UNICODE_OUT_OF_RANGE(ch)) {
- return 0;
- }
-#endif
return ((WORD_BITS >> GetCategory(ch)) & 1);
}
@@ -1671,8 +1633,8 @@ Tcl_UniCharIsWordChar(
int
Tcl_UniCharCaseMatch(
- const Tcl_UniChar *uniStr, /* Unicode String. */
- const Tcl_UniChar *uniPattern,
+ CONST Tcl_UniChar *uniStr, /* Unicode String. */
+ CONST Tcl_UniChar *uniPattern,
/* Pattern, which may contain special
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
@@ -1859,14 +1821,14 @@ Tcl_UniCharCaseMatch(
int
TclUniCharMatch(
- const Tcl_UniChar *string, /* Unicode String. */
+ CONST Tcl_UniChar *string, /* Unicode String. */
int strLen, /* Length of String */
- const Tcl_UniChar *pattern, /* Pattern, which may contain special
+ CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
* characters. */
int ptnLen, /* Length of Pattern */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- const Tcl_UniChar *stringEnd, *patternEnd;
+ CONST Tcl_UniChar *stringEnd, *patternEnd;
Tcl_UniChar p;
stringEnd = string + strLen;