summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c472
1 files changed, 252 insertions, 220 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 992a55f..15529c7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -7,8 +7,6 @@
*
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
- *
- * RCS: @(#) $Id: tclUtf.c,v 1.36 2005/09/07 15:31:10 dgp Exp $
*/
#include "tclInt.h"
@@ -23,46 +21,45 @@
* The following macros are used for fast character category tests. The x_BITS
* values are shifted right by the category value to determine whether the
* given category is included in the set.
- */
+ */
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
+
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
| (1 << PARAGRAPH_SEPARATOR))
-#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
-
-#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
- (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
- (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
- (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
- (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
- (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
- (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
- (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
- (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+ (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+ (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+ (1 << OTHER_NUMBER) | \
+ (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+ (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+
/*
- * Unicode characters less than this value are represented by themselves
- * in UTF-8 strings.
+ * Unicode characters less than this value are represented by themselves in
+ * UTF-8 strings.
*/
#define UNICODE_SELF 0x80
/*
- * The following structures are used when mapping between Unicode (UCS-2)
- * and UTF-8.
+ * The following structures are used when mapping between Unicode (UCS-2) and
+ * UTF-8.
*/
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -89,10 +86,10 @@ static CONST unsigned char totalBytes[256] = {
};
/*
- * Procedures used only in this module.
+ * Functions used only in this module.
*/
-static int UtfCount _ANSI_ARGS_((int ch));
+static int UtfCount(int ch);
/*
*---------------------------------------------------------------------------
@@ -109,10 +106,10 @@ static int UtfCount _ANSI_ARGS_((int ch));
*
*---------------------------------------------------------------------------
*/
-
+
INLINE static int
-UtfCount(ch)
- int ch; /* The Tcl_UniChar whose size is returned. */
+UtfCount(
+ int ch) /* The Tcl_UniChar whose size is returned. */
{
if ((ch > 0) && (ch < UNICODE_SELF)) {
return 1;
@@ -154,12 +151,12 @@ UtfCount(ch)
*
*---------------------------------------------------------------------------
*/
-
+
INLINE int
-Tcl_UniCharToUtf(ch, buf)
- int ch; /* The Tcl_UniChar to be stored in the
+Tcl_UniCharToUtf(
+ int ch, /* The Tcl_UniChar to be stored in the
* buffer. */
- char *buf; /* Buffer in which the UTF-8 representation of
+ char *buf) /* Buffer in which the UTF-8 representation of
* the Tcl_UniChar is stored. Buffer must be
* large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
@@ -231,16 +228,16 @@ Tcl_UniCharToUtf(ch, buf)
*
*---------------------------------------------------------------------------
*/
-
+
char *
-Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
- CONST Tcl_UniChar *uniStr; /* Unicode string to convert to UTF-8. */
- int uniLength; /* Length of Unicode string in Tcl_UniChars
+Tcl_UniCharToUtfDString(
+ const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ int uniLength, /* Length of Unicode string in Tcl_UniChars
* (must be >= 0). */
- Tcl_DString *dsPtr; /* UTF-8 representation of string is appended
+ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- CONST Tcl_UniChar *w, *wEnd;
+ const Tcl_UniChar *w, *wEnd;
char *p, *string;
int oldLength;
@@ -289,15 +286,15 @@ Tcl_UniCharToUtfDString(uniStr, uniLength, dsPtr)
*
*---------------------------------------------------------------------------
*/
-
+
int
-Tcl_UtfToUniChar(src, chPtr)
- register CONST char *src; /* The UTF-8 string. */
- register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented by
- * the UTF-8 string. */
+Tcl_UtfToUniChar(
+ register const char *src, /* The UTF-8 string. */
+ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+ * the UTF-8 string. */
{
register int byte;
-
+
/*
* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
*/
@@ -335,7 +332,7 @@ Tcl_UtfToUniChar(src, chPtr)
* Three-byte-character lead byte followed by two trail bytes.
*/
- *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
+ *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
return 3;
}
@@ -349,7 +346,7 @@ Tcl_UtfToUniChar(src, chPtr)
return 1;
}
#if TCL_UTF_MAX > 3
- else {
+ {
int ch, total, trail;
total = totalBytes[byte];
@@ -395,16 +392,16 @@ Tcl_UtfToUniChar(src, chPtr)
*/
Tcl_UniChar *
-Tcl_UtfToUniCharDString(src, length, dsPtr)
- CONST char *src; /* UTF-8 string to convert to Unicode. */
- int length; /* Length of UTF-8 string in bytes, or -1 for
+Tcl_UtfToUniCharDString(
+ const char *src, /* UTF-8 string to convert to Unicode. */
+ int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
- Tcl_DString *dsPtr; /* Unicode representation of string is
+ Tcl_DString *dsPtr) /* Unicode representation of string is
* appended to this previously initialized
* DString. */
{
Tcl_UniChar *w, *wString;
- CONST char *p, *end;
+ const char *p, *end;
int oldLength;
if (length < 0) {
@@ -417,6 +414,7 @@ Tcl_UtfToUniCharDString(src, length, dsPtr)
*/
oldLength = Tcl_DStringLength(dsPtr);
+/* TODO: fix overreach! */
Tcl_DStringSetLength(dsPtr,
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
@@ -454,10 +452,10 @@ Tcl_UtfToUniCharDString(src, length, dsPtr)
*/
int
-Tcl_UtfCharComplete(src, length)
- CONST char *src; /* String to check if first few bytes contain
+Tcl_UtfCharComplete(
+ const char *src, /* String to check if first few bytes contain
* a complete UTF-8 character. */
- int length; /* Length of above string in bytes. */
+ int length) /* Length of above string in bytes. */
{
int ch;
@@ -475,18 +473,18 @@ Tcl_UtfCharComplete(src, length)
* utflen() and utfnlen().
*
* Results:
- * As above.
+ * As above.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
-
-int
-Tcl_NumUtfChars(src, length)
- register CONST char *src; /* The UTF-8 string to measure. */
- int length; /* The length of the string in bytes, or -1
+
+int
+Tcl_NumUtfChars(
+ register const char *src, /* The UTF-8 string to measure. */
+ int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
Tcl_UniChar ch;
@@ -542,14 +540,15 @@ Tcl_NumUtfChars(src, length)
*
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfFindFirst(src, ch)
- CONST char *src; /* The UTF-8 string to be searched. */
- int ch; /* The Tcl_UniChar to search for. */
+
+const char *
+Tcl_UtfFindFirst(
+ const char *src, /* The UTF-8 string to be searched. */
+ int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
-
+
while (1) {
len = TclUtfToUniChar(src, &find);
if (find == ch) {
@@ -581,15 +580,15 @@ Tcl_UtfFindFirst(src, ch)
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfFindLast(src, ch)
- CONST char *src; /* The UTF-8 string to be searched. */
- int ch; /* The Tcl_UniChar to search for. */
+const char *
+Tcl_UtfFindLast(
+ const char *src, /* The UTF-8 string to be searched. */
+ int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
- CONST char *last;
-
+ const char *last;
+
last = NULL;
while (1) {
len = TclUtfToUniChar(src, &find);
@@ -622,10 +621,10 @@ Tcl_UtfFindLast(src, ch)
*
*---------------------------------------------------------------------------
*/
-
-CONST char *
-Tcl_UtfNext(src)
- CONST char *src; /* The current location in the string. */
+
+const char *
+Tcl_UtfNext(
+ const char *src) /* The current location in the string. */
{
Tcl_UniChar ch;
@@ -653,15 +652,15 @@ Tcl_UtfNext(src)
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfPrev(src, start)
- CONST char *src; /* The current location in the string. */
- CONST char *start; /* Pointer to the beginning of the string, to
+const char *
+Tcl_UtfPrev(
+ const char *src, /* The current location in the string. */
+ const char *start) /* Pointer to the beginning of the string, to
* avoid going backwards too far. */
{
- CONST char *look;
+ const char *look;
int i, byte;
-
+
src--;
look = src;
for (i = 0; i < TCL_UTF_MAX; i++) {
@@ -682,7 +681,7 @@ Tcl_UtfPrev(src, start)
}
return src;
}
-
+
/*
*---------------------------------------------------------------------------
*
@@ -699,13 +698,13 @@ Tcl_UtfPrev(src, start)
*
*---------------------------------------------------------------------------
*/
-
+
Tcl_UniChar
-Tcl_UniCharAtIndex(src, index)
- register CONST char *src; /* The UTF-8 string to dereference. */
- register int index; /* The position of the desired character. */
+Tcl_UniCharAtIndex(
+ register const char *src, /* The UTF-8 string to dereference. */
+ register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index >= 0) {
index--;
@@ -731,13 +730,13 @@ Tcl_UniCharAtIndex(src, index)
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfAtIndex(src, index)
- register CONST char *src; /* The UTF-8 string. */
- register int index; /* The position of the desired character. */
+const char *
+Tcl_UtfAtIndex(
+ register const char *src, /* The UTF-8 string. */
+ register int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
-
+
while (index > 0) {
index--;
src += TclUtfToUniChar(src, &ch);
@@ -772,12 +771,12 @@ Tcl_UtfAtIndex(src, index)
*/
int
-Tcl_UtfBackslash(src, readPtr, dst)
- CONST char *src; /* Points to the backslash character of a
+Tcl_UtfBackslash(
+ const char *src, /* Points to the backslash character of a
* backslash sequence. */
- int *readPtr; /* Fill in with number of characters read from
+ int *readPtr, /* Fill in with number of characters read from
* src, unless NULL. */
- char *dst; /* Filled with the bytes represented by the
+ char *dst) /* Filled with the bytes represented by the
* backslash sequence. */
{
#define LINE_LENGTH 128
@@ -786,7 +785,10 @@ Tcl_UtfBackslash(src, readPtr, dst)
result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
if (numRead == LINE_LENGTH) {
- /* We ate a whole line. Pay the price of a strlen() */
+ /*
+ * We ate a whole line. Pay the price of a strlen()
+ */
+
result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
}
if (readPtr != NULL) {
@@ -814,8 +816,8 @@ Tcl_UtfBackslash(src, readPtr, dst)
*/
int
-Tcl_UtfToUpper(str)
- char *str; /* String to convert in place. */
+Tcl_UtfToUpper(
+ char *str) /* String to convert in place. */
{
Tcl_UniChar ch, upChar;
char *src, *dst;
@@ -835,7 +837,7 @@ Tcl_UtfToUpper(str)
* conversion (thereby causing a segfault), only copy the upper case
* char to dst if its size is <= the original char.
*/
-
+
if (bytes < UtfCount(upChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
@@ -867,13 +869,13 @@ Tcl_UtfToUpper(str)
*/
int
-Tcl_UtfToLower(str)
- char *str; /* String to convert in place. */
+Tcl_UtfToLower(
+ char *str) /* String to convert in place. */
{
Tcl_UniChar ch, lowChar;
char *src, *dst;
int bytes;
-
+
/*
* Iterate over the string until we hit the terminating null.
*/
@@ -888,7 +890,7 @@ Tcl_UtfToLower(str)
* conversion (thereby causing a segfault), only copy the lower case
* char to dst if its size is <= the original char.
*/
-
+
if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
@@ -921,13 +923,13 @@ Tcl_UtfToLower(str)
*/
int
-Tcl_UtfToTitle(str)
- char *str; /* String to convert in place. */
+Tcl_UtfToTitle(
+ char *str) /* String to convert in place. */
{
Tcl_UniChar ch, titleChar, lowChar;
char *src, *dst;
int bytes;
-
+
/*
* Capitalize the first character and then lowercase the rest of the
* characters until we get to a null.
@@ -968,8 +970,8 @@ Tcl_UtfToTitle(str)
*
* TclpUtfNcmp2 --
*
- * Compare at most n bytes of utf-8 strings cs and ct. Both cs and ct are
- * assumed to be at least n bytes long.
+ * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
+ * ct are assumed to be at least numBytes bytes long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -981,26 +983,26 @@ Tcl_UtfToTitle(str)
*/
int
-TclpUtfNcmp2(cs, ct, n)
- CONST char *cs; /* UTF string to compare to ct. */
- CONST char *ct; /* UTF string cs is compared to. */
- unsigned long n; /* Number of *bytes* to compare. */
+TclpUtfNcmp2(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
+ unsigned long numBytes) /* Number of *bytes* to compare. */
{
/*
- * We can't simply call 'memcmp(cs, ct, n);' because we need to check for
- * Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes fine in
- * the strcmp manner.
+ * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
+ * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
+ * fine in the strcmp manner.
*/
register int result = 0;
- for ( ; n != 0; n--, cs++, ct++) {
+ for ( ; numBytes != 0; numBytes--, cs++, ct++) {
if (*cs != *ct) {
result = UCHAR(*cs) - UCHAR(*ct);
break;
}
}
- if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
+ if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
unsigned char c1, c2;
c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
@@ -1028,10 +1030,10 @@ TclpUtfNcmp2(cs, ct, n)
*/
int
-Tcl_UtfNcmp(cs, ct, numChars)
- CONST char *cs; /* UTF string to compare to ct. */
- CONST char *ct; /* UTF string cs is compared to. */
- unsigned long numChars; /* Number of UTF chars to compare. */
+Tcl_UtfNcmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
+ unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1063,8 +1065,8 @@ Tcl_UtfNcmp(cs, ct, numChars)
* Tcl_UtfNcasecmp --
*
* Compare at most numChars UTF chars of string cs to string ct case
- * insensitive. Both cs and ct are assumed to be at least numChars
- * UTF chars long.
+ * insensitive. Both cs and ct are assumed to be at least numChars UTF
+ * chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -1076,10 +1078,10 @@ Tcl_UtfNcmp(cs, ct, numChars)
*/
int
-Tcl_UtfNcasecmp(cs, ct, numChars)
- CONST char *cs; /* UTF string to compare to ct. */
- CONST char *ct; /* UTF string cs is compared to. */
- unsigned long numChars; /* Number of UTF chars to compare. */
+Tcl_UtfNcasecmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
+ unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
while (numChars-- > 0) {
@@ -1104,6 +1106,46 @@ Tcl_UtfNcasecmp(cs, ct, numChars)
/*
*----------------------------------------------------------------------
*
+ * Tcl_UtfNcasecmp --
+ *
+ * Compare UTF chars of string cs to string ct case insensitively.
+ * Replacement for strcasecmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCasecmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ while (*cs && *ct) {
+ Tcl_UniChar ch1, ch2;
+
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+ ch1 = Tcl_UniCharToLower(ch1);
+ ch2 = Tcl_UniCharToLower(ch2);
+ if (ch1 != ch2) {
+ return ch1 - ch2;
+ }
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UniCharToUpper --
*
* Compute the uppercase equivalent of the given Unicode character.
@@ -1118,16 +1160,15 @@ Tcl_UtfNcasecmp(cs, ct, numChars)
*/
Tcl_UniChar
-Tcl_UniCharToUpper(ch)
- int ch; /* Unicode character to convert. */
+Tcl_UniCharToUpper(
+ int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x04) {
- return (Tcl_UniChar) (ch - GetDelta(info));
- } else {
- return ch;
+ ch -= GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1147,16 +1188,15 @@ Tcl_UniCharToUpper(ch)
*/
Tcl_UniChar
-Tcl_UniCharToLower(ch)
- int ch; /* Unicode character to convert. */
+Tcl_UniCharToLower(
+ int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x02) {
- return (Tcl_UniChar) (ch + GetDelta(info));
- } else {
- return ch;
+ ch += GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1176,8 +1216,8 @@ Tcl_UniCharToLower(ch)
*/
Tcl_UniChar
-Tcl_UniCharToTitle(ch)
- int ch; /* Unicode character to convert. */
+Tcl_UniCharToTitle(
+ int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
int mode = GetCaseType(info);
@@ -1187,12 +1227,11 @@ Tcl_UniCharToTitle(ch)
* Subtract or add one depending on the original case.
*/
- return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+ ch += ((mode & 0x4) ? -1 : 1);
} else if (mode == 0x4) {
- return (Tcl_UniChar) (ch - GetDelta(info));
- } else {
- return ch;
+ ch -= GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1200,7 +1239,7 @@ Tcl_UniCharToTitle(ch)
*
* Tcl_UniCharLen --
*
- * Find the length of a UniChar string. The str input must be null
+ * Find the length of a UniChar string. The str input must be null
* terminated.
*
* Results:
@@ -1213,11 +1252,11 @@ Tcl_UniCharToTitle(ch)
*/
int
-Tcl_UniCharLen(uniStr)
- CONST Tcl_UniChar *uniStr; /* Unicode string to find length of. */
+Tcl_UniCharLen(
+ const Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
int len = 0;
-
+
while (*uniStr != '\0') {
len++;
uniStr++;
@@ -1243,10 +1282,10 @@ Tcl_UniCharLen(uniStr)
*/
int
-Tcl_UniCharNcmp(ucs, uct, numChars)
- CONST Tcl_UniChar *ucs; /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct; /* Unicode string ucs is compared to. */
- unsigned long numChars; /* Number of unichars to compare. */
+Tcl_UniCharNcmp(
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ unsigned long numChars) /* Number of unichars to compare. */
{
#ifdef WORDS_BIGENDIAN
/*
@@ -1275,7 +1314,7 @@ Tcl_UniCharNcmp(ucs, uct, numChars)
* Tcl_UniCharNcasecmp --
*
* Compare at most numChars unichars of string ucs to string uct case
- * insensitive. Both ucs and uct are assumed to be at least numChars
+ * insensitive. Both ucs and uct are assumed to be at least numChars
* unichars long.
*
* Results:
@@ -1288,15 +1327,16 @@ Tcl_UniCharNcmp(ucs, uct, numChars)
*/
int
-Tcl_UniCharNcasecmp(ucs, uct, numChars)
- CONST Tcl_UniChar *ucs; /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct; /* Unicode string ucs is compared to. */
- unsigned long numChars; /* Number of unichars to compare. */
+Tcl_UniCharNcasecmp(
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ unsigned long numChars) /* Number of unichars to compare. */
{
for ( ; numChars != 0; numChars--, ucs++, uct++) {
if (*ucs != *uct) {
Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
+
if (lcs != lct) {
return (lcs - lct);
}
@@ -1322,12 +1362,10 @@ Tcl_UniCharNcasecmp(ucs, uct, numChars)
*/
int
-Tcl_UniCharIsAlnum(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsAlnum(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+ return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1347,11 +1385,10 @@ Tcl_UniCharIsAlnum(ch)
*/
int
-Tcl_UniCharIsAlpha(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsAlpha(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((ALPHA_BITS >> category) & 1);
+ return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1371,10 +1408,10 @@ Tcl_UniCharIsAlpha(ch)
*/
int
-Tcl_UniCharIsControl(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsControl(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+ return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1394,11 +1431,10 @@ Tcl_UniCharIsControl(ch)
*/
int
-Tcl_UniCharIsDigit(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsDigit(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
- == DECIMAL_DIGIT_NUMBER);
+ return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
/*
@@ -1418,11 +1454,10 @@ Tcl_UniCharIsDigit(ch)
*/
int
-Tcl_UniCharIsGraph(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsGraph(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
+ return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1442,10 +1477,10 @@ Tcl_UniCharIsGraph(ch)
*/
int
-Tcl_UniCharIsLower(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsLower(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+ return (GetCategory(ch) == LOWERCASE_LETTER);
}
/*
@@ -1465,11 +1500,10 @@ Tcl_UniCharIsLower(ch)
*/
int
-Tcl_UniCharIsPrint(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsPrint(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PRINT_BITS >> category) & 1);
+ return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1489,11 +1523,10 @@ Tcl_UniCharIsPrint(ch)
*/
int
-Tcl_UniCharIsPunct(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsPunct(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PUNCT_BITS >> category) & 1);
+ return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1513,21 +1546,22 @@ Tcl_UniCharIsPunct(ch)
*/
int
-Tcl_UniCharIsSpace(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsSpace(
+ int ch) /* Unicode character to test. */
{
- register int category;
-
/*
* If the character is within the first 127 characters, just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (ch < 0x80) {
- return isspace(UCHAR(ch)); /* INTL: ISO space */
+ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ return TclIsSpaceProc((char) ch);
+ } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e
+ || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060
+ || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) {
+ return 1;
} else {
- category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((SPACE_BITS >> category) & 1);
+ return ((SPACE_BITS >> GetCategory(ch)) & 1);
}
}
@@ -1548,10 +1582,10 @@ Tcl_UniCharIsSpace(ch)
*/
int
-Tcl_UniCharIsUpper(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsUpper(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+ return (GetCategory(ch) == UPPERCASE_LETTER);
}
/*
@@ -1571,12 +1605,10 @@ Tcl_UniCharIsUpper(ch)
*/
int
-Tcl_UniCharIsWordChar(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsWordChar(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+ return ((WORD_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1603,24 +1635,24 @@ Tcl_UniCharIsWordChar(ch)
*/
int
-Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
- CONST Tcl_UniChar *uniStr; /* Unicode String. */
- CONST Tcl_UniChar *uniPattern;
+Tcl_UniCharCaseMatch(
+ const Tcl_UniChar *uniStr, /* Unicode String. */
+ const Tcl_UniChar *uniPattern,
/* Pattern, which may contain special
* characters. */
- int nocase; /* 0 for case sensitive, 1 for insensitive */
+ int nocase) /* 0 for case sensitive, 1 for insensitive */
{
Tcl_UniChar ch1, p;
-
+
while (1) {
p = *uniPattern;
-
+
/*
* See if we're at the end of both the pattern and the string. If so,
* we succeeded. If we're at the end of the pattern but not at the end
* of the string, we failed.
*/
-
+
if (p == 0) {
return (*uniStr == 0);
}
@@ -1635,7 +1667,7 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
* recursively for each postfix of string, until either we match or we
* reach the end of the string.
*/
-
+
if (p == '*') {
/*
* Skip all successive *'s in the pattern
@@ -1696,7 +1728,7 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
* list of characters that are acceptable, or by a range (two
* characters separated by "-").
*/
-
+
if (p == '[') {
Tcl_UniChar startChar, endChar;
@@ -1752,8 +1784,8 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
}
/*
- * There's no special character. Just make sure that the next bytes
- * of each string match.
+ * There's no special character. Just make sure that the next bytes of
+ * each string match.
*/
if (nocase) {
@@ -1791,15 +1823,15 @@ Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)
*/
int
-TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
- CONST Tcl_UniChar *string; /* Unicode String. */
- int strLen; /* length of String */
- CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
+TclUniCharMatch(
+ const Tcl_UniChar *string, /* Unicode String. */
+ int strLen, /* Length of String */
+ const Tcl_UniChar *pattern, /* Pattern, which may contain special
* characters. */
- int ptnLen; /* length of Pattern */
- int nocase; /* 0 for case sensitive, 1 for insensitive */
+ int ptnLen, /* Length of Pattern */
+ int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- CONST Tcl_UniChar *stringEnd, *patternEnd;
+ const Tcl_UniChar *stringEnd, *patternEnd;
Tcl_UniChar p;
stringEnd = string + strLen;
@@ -1827,7 +1859,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
* recursively for each postfix of string, until either we match or we
* reach the end of the string.
*/
-
+
if (p == '*') {
/*
* Skip all successive *'s in the pattern.
@@ -1889,7 +1921,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
* list of characters that are acceptable, or by a range (two
* characters separated by "-").
*/
-
+
if (p == '[') {
Tcl_UniChar ch1, startChar, endChar;