summary | refs | log | tree | commit | diff | stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 208
1 files changed, 136 insertions, 72 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e5497a4..b878149 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -59,7 +59,7 @@
* UTF-8.
*/
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -117,19 +117,10 @@ UtfCount(
if (ch <= 0x7FF) {
return 2;
}
- if (ch <= 0xFFFF) {
- return 3;
- }
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) {
return 4;
}
- if (ch <= 0x3FFFFFF) {
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- return 6;
- }
#endif
return 3;
}
@@ -172,6 +163,23 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
+#if TCL_UTF_MAX == 4
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x0400) {
+ /* Low surrogate */
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
+ return 4;
+ } else {
+ /* High surrogate */
+ ch += 0x40;
+ buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
+ buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
+ buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ return 0;
+ }
+ }
+#endif
three:
buf[2] = (char) ((ch | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
@@ -180,30 +188,13 @@ Tcl_UniCharToUtf(
}
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (ch <= 0x10FFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
- if (ch <= 0x3FFFFFF) {
- buf[4] = (char) ((ch | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- buf[5] = (char) ((ch | 0x80) & 0xBF);
- buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
#endif
}
@@ -231,13 +222,13 @@ Tcl_UniCharToUtf(
char *
Tcl_UniCharToUtfDString(
- CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
int uniLength, /* Length of Unicode string in Tcl_UniChars
* (must be >= 0). */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- CONST Tcl_UniChar *w, *wEnd;
+ const Tcl_UniChar *w, *wEnd;
char *p, *string;
int oldLength;
@@ -289,7 +280,7 @@ Tcl_UniCharToUtfDString(
int
Tcl_UtfToUniChar(
- register CONST char *src, /* The UTF-8 string. */
+ register const char *src, /* The UTF-8 string. */
register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
@@ -393,7 +384,7 @@ Tcl_UtfToUniChar(
Tcl_UniChar *
Tcl_UtfToUniCharDString(
- CONST char *src, /* UTF-8 string to convert to Unicode. */
+ const char *src, /* UTF-8 string to convert to Unicode. */
int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
Tcl_DString *dsPtr) /* Unicode representation of string is
@@ -401,7 +392,7 @@ Tcl_UtfToUniCharDString(
* DString. */
{
Tcl_UniChar *w, *wString;
- CONST char *p, *end;
+ const char *p, *end;
int oldLength;
if (length < 0) {
@@ -414,6 +405,7 @@ Tcl_UtfToUniCharDString(
*/
oldLength = Tcl_DStringLength(dsPtr);
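+/* TODO: fix overreach! oldLength is already a byte count (it is used as a byte offset for wString below), so it should not be scaled by sizeof(Tcl_UniChar). */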
Tcl_DStringSetLength(dsPtr,
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -452,7 +444,7 @@ Tcl_UtfToUniCharDString(
int
Tcl_UtfCharComplete(
- CONST char *src, /* String to check if first few bytes contain
+ const char *src, /* String to check if first few bytes contain
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
@@ -482,7 +474,7 @@ Tcl_UtfCharComplete(
int
Tcl_NumUtfChars(
- register CONST char *src, /* The UTF-8 string to measure. */
+ register const char *src, /* The UTF-8 string to measure. */
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
@@ -540,9 +532,9 @@ Tcl_NumUtfChars(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfFindFirst(
- CONST char *src, /* The UTF-8 string to be searched. */
+ const char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
int len;
@@ -579,14 +571,14 @@ Tcl_UtfFindFirst(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfFindLast(
- CONST char *src, /* The UTF-8 string to be searched. */
+ const char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
- CONST char *last;
+ const char *last;
last = NULL;
while (1) {
@@ -621,9 +613,9 @@ Tcl_UtfFindLast(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfNext(
- CONST char *src) /* The current location in the string. */
+ const char *src) /* The current location in the string. */
{
Tcl_UniChar ch;
@@ -651,13 +643,13 @@ Tcl_UtfNext(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfPrev(
- CONST char *src, /* The current location in the string. */
- CONST char *start) /* Pointer to the beginning of the string, to
+ const char *src, /* The current location in the string. */
+ const char *start) /* Pointer to the beginning of the string, to
* avoid going backwards too far. */
{
- CONST char *look;
+ const char *look;
int i, byte;
src--;
@@ -700,10 +692,10 @@ Tcl_UtfPrev(
Tcl_UniChar
Tcl_UniCharAtIndex(
- register CONST char *src, /* The UTF-8 string to dereference. */
+ register const char *src, /* The UTF-8 string to dereference. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index >= 0) {
index--;
@@ -729,9 +721,9 @@ Tcl_UniCharAtIndex(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfAtIndex(
- register CONST char *src, /* The UTF-8 string. */
+ register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
@@ -771,7 +763,7 @@ Tcl_UtfAtIndex(
int
Tcl_UtfBackslash(
- CONST char *src, /* Points to the backslash character of a
+ const char *src, /* Points to the backslash character of a
* backslash sequence. */
int *readPtr, /* Fill in with number of characters read from
* src, unless NULL. */
@@ -983,8 +975,8 @@ Tcl_UtfToTitle(
int
TclpUtfNcmp2(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numBytes) /* Number of *bytes* to compare. */
{
/*
@@ -1030,8 +1022,8 @@ TclpUtfNcmp2(
int
Tcl_UtfNcmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1078,8 +1070,8 @@ Tcl_UtfNcmp(
int
Tcl_UtfNcasecmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1122,8 +1114,8 @@ Tcl_UtfNcasecmp(
int
TclUtfCasecmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct) /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
{
while (*cs && *ct) {
Tcl_UniChar ch1, ch2;
@@ -1252,7 +1244,7 @@ Tcl_UniCharToTitle(
int
Tcl_UniCharLen(
- CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */
+ const Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
int len = 0;
@@ -1282,8 +1274,8 @@ Tcl_UniCharLen(
int
Tcl_UniCharNcmp(
- CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
#ifdef WORDS_BIGENDIAN
@@ -1327,8 +1319,8 @@ Tcl_UniCharNcmp(
int
Tcl_UniCharNcasecmp(
- CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1364,6 +1356,11 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
@@ -1387,6 +1384,11 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
@@ -1410,6 +1412,18 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+ return 1;
+ }
+ if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+ return 1;
+ }
+ return 0;
+ }
+#endif
return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
@@ -1433,6 +1447,11 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
@@ -1456,6 +1475,12 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ }
+#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
@@ -1479,6 +1504,11 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == LOWERCASE_LETTER);
}
@@ -1502,6 +1532,12 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ }
+#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
@@ -1525,6 +1561,11 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
@@ -1548,14 +1589,27 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ /* Ignore upper 11 bits. */
+ ch &= 0x1fffff;
+#else
+ /* Ignore upper 16 bits. */
+ ch &= 0xffff;
+#endif
+
/*
* If the character is within the first 127 characters, just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ if (ch < 0x80) {
return TclIsSpaceProc((char) ch);
- } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
+#if TCL_UTF_MAX > 3
+ } else if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+#endif
+ } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
+ || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1582,6 +1636,11 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == UPPERCASE_LETTER);
}
@@ -1605,6 +1664,11 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((WORD_BITS >> GetCategory(ch)) & 1);
}
@@ -1633,8 +1697,8 @@ Tcl_UniCharIsWordChar(
int
Tcl_UniCharCaseMatch(
- CONST Tcl_UniChar *uniStr, /* Unicode String. */
- CONST Tcl_UniChar *uniPattern,
+ const Tcl_UniChar *uniStr, /* Unicode String. */
+ const Tcl_UniChar *uniPattern,
/* Pattern, which may contain special
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
@@ -1821,14 +1885,14 @@ Tcl_UniCharCaseMatch(
int
TclUniCharMatch(
- CONST Tcl_UniChar *string, /* Unicode String. */
+ const Tcl_UniChar *string, /* Unicode String. */
int strLen, /* Length of String */
- CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
+ const Tcl_UniChar *pattern, /* Pattern, which may contain special
* characters. */
int ptnLen, /* Length of Pattern */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- CONST Tcl_UniChar *stringEnd, *patternEnd;
+ const Tcl_UniChar *stringEnd, *patternEnd;
Tcl_UniChar p;
stringEnd = string + strLen;