summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c1359
1 files changed, 776 insertions, 583 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0148ca6..15529c7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -5,10 +5,8 @@
*
* Copyright (c) 1997-1998 Sun Microsystems, Inc.
*
- * See the file "license.terms" for information on usage and redistribution
- * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
- *
- * RCS: @(#) $Id: tclUtf.c,v 1.23 2002/02/08 02:52:54 dgp Exp $
+ * See the file "license.terms" for information on usage and redistribution of
+ * this file, and for a DISCLAIMER OF ALL WARRANTIES.
*/
#include "tclInt.h"
@@ -20,49 +18,48 @@
#include "tclUniData.c"
/*
- * The following macros are used for fast character category tests. The
- * x_BITS values are shifted right by the category value to determine whether
- * the given category is included in the set.
- */
+ * The following macros are used for fast character category tests. The x_BITS
+ * values are shifted right by the category value to determine whether the
+ * given category is included in the set.
+ */
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
- | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
+ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
+
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
- | (1 << PARAGRAPH_SEPARATOR))
+ | (1 << PARAGRAPH_SEPARATOR))
-#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
-
-#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
- (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
- (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
- (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
- (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
- (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
- (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
- (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
- (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
- (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
- (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
- (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+ (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
+ (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
+ (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+
+#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+ (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+ (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+ (1 << OTHER_NUMBER) | \
+ (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+ (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
/*
- * Unicode characters less than this value are represented by themselves
- * in UTF-8 strings.
+ * Unicode characters less than this value are represented by themselves in
+ * UTF-8 strings.
*/
#define UNICODE_SELF 0x80
/*
- * The following structures are used when mapping between Unicode (UCS-2)
- * and UTF-8.
+ * The following structures are used when mapping between Unicode (UCS-2) and
+ * UTF-8.
*/
-
-CONST unsigned char totalBytes[256] = {
+
+static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -89,11 +86,10 @@ CONST unsigned char totalBytes[256] = {
};
/*
- * Procedures used only in this module.
+ * Functions used only in this module.
*/
-static int UtfCount _ANSI_ARGS_((int ch));
-
+static int UtfCount(int ch);
/*
*---------------------------------------------------------------------------
@@ -110,10 +106,10 @@ static int UtfCount _ANSI_ARGS_((int ch));
*
*---------------------------------------------------------------------------
*/
-
+
INLINE static int
-UtfCount(ch)
- int ch; /* The Tcl_UniChar whose size is returned. */
+UtfCount(
+ int ch) /* The Tcl_UniChar whose size is returned. */
{
if ((ch > 0) && (ch < UNICODE_SELF)) {
return 1;
@@ -144,70 +140,72 @@ UtfCount(ch)
* Tcl_UniCharToUtf --
*
* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
- * provided buffer. Equivalent to Plan 9 runetochar().
+ * provided buffer. Equivalent to Plan 9 runetochar().
*
* Results:
- * The return values is the number of bytes in the buffer that
- * were consumed.
+ * The return values is the number of bytes in the buffer that were
+ * consumed.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
-
+
INLINE int
-Tcl_UniCharToUtf(ch, str)
- int ch; /* The Tcl_UniChar to be stored in the
+Tcl_UniCharToUtf(
+ int ch, /* The Tcl_UniChar to be stored in the
* buffer. */
- char *str; /* Buffer in which the UTF-8 representation
- * of the Tcl_UniChar is stored. Buffer must
- * be large enough to hold the UTF-8 character
+ char *buf) /* Buffer in which the UTF-8 representation of
+ * the Tcl_UniChar is stored. Buffer must be
+ * large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
{
if ((ch > 0) && (ch < UNICODE_SELF)) {
- str[0] = (char) ch;
+ buf[0] = (char) ch;
return 1;
}
- if (ch <= 0x7FF) {
- str[1] = (char) ((ch | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 6) | 0xC0);
- return 2;
- }
- if (ch <= 0xFFFF) {
+ if (ch >= 0) {
+ if (ch <= 0x7FF) {
+ buf[1] = (char) ((ch | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 6) | 0xC0);
+ return 2;
+ }
+ if (ch <= 0xFFFF) {
three:
- str[2] = (char) ((ch | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
- }
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
+ }
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
- str[3] = (char) ((ch | 0x80) & 0xBF);
- str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 18) | 0xF0);
- return 4;
- }
- if (ch <= 0x3FFFFFF) {
- str[4] = (char) ((ch | 0x80) & 0xBF);
- str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- str[5] = (char) ((ch | 0x80) & 0xBF);
- str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- str[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
+ if (ch <= 0x1FFFFF) {
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 18) | 0xF0);
+ return 4;
+ }
+ if (ch <= 0x3FFFFFF) {
+ buf[4] = (char) ((ch | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 24) | 0xF8);
+ return 5;
+ }
+ if (ch <= 0x7FFFFFFF) {
+ buf[5] = (char) ((ch | 0x80) & 0xBF);
+ buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
+ buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 30) | 0xFC);
+ return 6;
+ }
#endif
+ }
ch = 0xFFFD;
goto three;
@@ -222,25 +220,24 @@ Tcl_UniCharToUtf(ch, str)
*
* Results:
* The return value is a pointer to the UTF-8 representation of the
- * Unicode string. Storage for the return value is appended to the
- * end of dsPtr.
+ * Unicode string. Storage for the return value is appended to the end of
+ * dsPtr.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
-
+
char *
-Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
- CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
- int numChars; /* Length of Unicode string in Tcl_UniChars
+Tcl_UniCharToUtfDString(
+ const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ int uniLength, /* Length of Unicode string in Tcl_UniChars
* (must be >= 0). */
- Tcl_DString *dsPtr; /* UTF-8 representation of string is
- * appended to this previously initialized
- * DString. */
+ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
+ * to this previously initialized DString. */
{
- CONST Tcl_UniChar *w, *wEnd;
+ const Tcl_UniChar *w, *wEnd;
char *p, *string;
int oldLength;
@@ -250,12 +247,12 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
*/
oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
+ Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
string = Tcl_DStringValue(dsPtr) + oldLength;
p = string;
- wEnd = wString + numChars;
- for (w = wString; w < wEnd; ) {
+ wEnd = uniStr + uniLength;
+ for (w = uniStr; w < wEnd; ) {
p += Tcl_UniCharToUtf(*w, p);
w++;
}
@@ -269,16 +266,16 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
*
* Tcl_UtfToUniChar --
*
- * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
- * UTF-8 sequences are converted to valid Tcl_UniChars and processing
- * continues. Equivalent to Plan 9 chartorune().
+ * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
+ * sequences are converted to valid Tcl_UniChars and processing
+ * continues. Equivalent to Plan 9 chartorune().
*
- * The caller must ensure that the source buffer is long enough that
- * this routine does not run off the end and dereference non-existent
- * memory looking for trail bytes. If the source buffer is known to
- * be '\0' terminated, this cannot happen. Otherwise, the caller
- * should call Tcl_UtfCharComplete() before calling this routine to
- * ensure that enough bytes remain in the string.
+ * The caller must ensure that the source buffer is long enough that this
+ * routine does not run off the end and dereference non-existent memory
+ * looking for trail bytes. If the source buffer is known to be '\0'
+ * terminated, this cannot happen. Otherwise, the caller should call
+ * Tcl_UtfCharComplete() before calling this routine to ensure that
+ * enough bytes remain in the string.
*
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
@@ -289,55 +286,57 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
*
*---------------------------------------------------------------------------
*/
-
+
int
-Tcl_UtfToUniChar(str, chPtr)
- register CONST char *str; /* The UTF-8 string. */
- register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
- * by the UTF-8 string. */
+Tcl_UtfToUniChar(
+ register const char *src, /* The UTF-8 string. */
+ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+ * the UTF-8 string. */
{
register int byte;
-
+
/*
* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
*/
- byte = *((unsigned char *) str);
+ byte = *((unsigned char *) src);
if (byte < 0xC0) {
/*
* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
* characters representing themselves.
*/
-
+
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xE0) {
- if ((str[1] & 0xC0) == 0x80) {
+ if ((src[1] & 0xC0) == 0x80) {
/*
* Two-byte-character lead-byte followed by a trail-byte.
*/
-
- *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
return 2;
}
+
/*
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
+
*chPtr = (Tcl_UniChar) byte;
return 1;
} else if (byte < 0xF0) {
- if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
* Three-byte-character lead byte followed by two trail bytes.
*/
- *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
- | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
+ *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
+ | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
return 3;
}
+
/*
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
@@ -347,7 +346,7 @@ Tcl_UtfToUniChar(str, chPtr)
return 1;
}
#if TCL_UTF_MAX > 3
- else {
+ {
int ch, total, trail;
total = totalBytes[byte];
@@ -355,13 +354,13 @@ Tcl_UtfToUniChar(str, chPtr)
if (trail > 0) {
ch = byte & (0x3F >> trail);
do {
- str++;
- if ((*str & 0xC0) != 0x80) {
+ src++;
+ if ((*src & 0xC0) != 0x80) {
*chPtr = byte;
return 1;
}
ch <<= 6;
- ch |= (*str & 0x3F);
+ ch |= (*src & 0x3F);
trail--;
} while (trail > 0);
*chPtr = ch;
@@ -383,9 +382,8 @@ Tcl_UtfToUniChar(str, chPtr)
*
* Results:
* The return value is a pointer to the Unicode representation of the
- * UTF-8 string. Storage for the return value is appended to the
- * end of dsPtr. The Unicode string is terminated with a Unicode
- * NULL character.
+ * UTF-8 string. Storage for the return value is appended to the end of
+ * dsPtr. The Unicode string is terminated with a Unicode NULL character.
*
* Side effects:
* None.
@@ -394,36 +392,37 @@ Tcl_UtfToUniChar(str, chPtr)
*/
Tcl_UniChar *
-Tcl_UtfToUniCharDString(string, length, dsPtr)
- CONST char *string; /* UTF-8 string to convert to Unicode. */
- int length; /* Length of UTF-8 string in bytes, or -1
- * for strlen(). */
- Tcl_DString *dsPtr; /* Unicode representation of string is
+Tcl_UtfToUniCharDString(
+ const char *src, /* UTF-8 string to convert to Unicode. */
+ int length, /* Length of UTF-8 string in bytes, or -1 for
+ * strlen(). */
+ Tcl_DString *dsPtr) /* Unicode representation of string is
* appended to this previously initialized
* DString. */
{
Tcl_UniChar *w, *wString;
- CONST char *p, *end;
+ const char *p, *end;
int oldLength;
if (length < 0) {
- length = strlen(string);
+ length = strlen(src);
}
/*
- * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
- * in bytes.
+ * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
+ * bytes.
*/
oldLength = Tcl_DStringLength(dsPtr);
+/* TODO: fix overreach! */
Tcl_DStringSetLength(dsPtr,
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
- end = string + length;
- for (p = string; p < end; ) {
- p += Tcl_UtfToUniChar(p, w);
+ end = src + length;
+ for (p = src; p < end; ) {
+ p += TclUtfToUniChar(p, w);
w++;
}
*w = '\0';
@@ -438,9 +437,9 @@ Tcl_UtfToUniCharDString(string, length, dsPtr)
*
* Tcl_UtfCharComplete --
*
- * Determine if the UTF-8 string of the given length is long enough
- * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
- * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
+ * Determine if the UTF-8 string of the given length is long enough to be
+ * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8
+ * string is properly formed. Equivalent to Plan 9 fullrune().
*
* Results:
* The return value is 0 if the string is not long enough, non-zero
@@ -453,15 +452,15 @@ Tcl_UtfToUniCharDString(string, length, dsPtr)
*/
int
-Tcl_UtfCharComplete(str, len)
- CONST char *str; /* String to check if first few bytes
- * contain a complete UTF-8 character. */
- int len; /* Length of above string in bytes. */
+Tcl_UtfCharComplete(
+ const char *src, /* String to check if first few bytes contain
+ * a complete UTF-8 character. */
+ int length) /* Length of above string in bytes. */
{
int ch;
- ch = *((unsigned char *) str);
- return len >= totalBytes[ch];
+ ch = *((unsigned char *) src);
+ return length >= totalBytes[ch];
}
/*
@@ -469,48 +468,54 @@ Tcl_UtfCharComplete(str, len)
*
* Tcl_NumUtfChars --
*
- * Returns the number of characters (not bytes) in the UTF-8 string,
- * not including the terminating NULL byte. This is equivalent to
- * Plan 9 utflen() and utfnlen().
+ * Returns the number of characters (not bytes) in the UTF-8 string, not
+ * including the terminating NULL byte. This is equivalent to Plan 9
+ * utflen() and utfnlen().
*
* Results:
- * As above.
+ * As above.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
-
-int
-Tcl_NumUtfChars(str, len)
- register CONST char *str; /* The UTF-8 string to measure. */
- int len; /* The length of the string in bytes, or -1
+
+int
+Tcl_NumUtfChars(
+ register const char *src, /* The UTF-8 string to measure. */
+ int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
Tcl_UniChar ch;
register Tcl_UniChar *chPtr = &ch;
- register int n;
- int i;
+ register int i;
/*
* The separate implementations are faster.
+ *
+ * Since this is a time-sensitive function, we also do the check for the
+ * single-byte char case specially.
*/
-
+
i = 0;
- if (len < 0) {
- while (1) {
- str += Tcl_UtfToUniChar(str, chPtr);
- if (ch == '\0') {
- break;
- }
+ if (length < 0) {
+ while (*src != '\0') {
+ src += TclUtfToUniChar(src, chPtr);
i++;
}
} else {
- while (len > 0) {
- n = Tcl_UtfToUniChar(str, chPtr);
- len -= n;
- str += n;
+ register int n;
+
+ while (length > 0) {
+ if (UCHAR(*src) < 0xC0) {
+ length--;
+ src++;
+ } else {
+ n = Tcl_UtfToUniChar(src, chPtr);
+ length -= n;
+ src += n;
+ }
i++;
}
}
@@ -522,37 +527,37 @@ Tcl_NumUtfChars(str, len)
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurance of the given Tcl_UniChar
- * in the NULL-terminated UTF-8 string. The NULL terminator is
- * considered part of the UTF-8 string. Equivalent to Plan 9
- * utfrune().
+ * Returns a pointer to the first occurance of the given Tcl_UniChar in
+ * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string,
- * the return value is NULL.
+ * As above. If the Tcl_UniChar does not exist in the given string, the
+ * return value is NULL.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfFindFirst(string, ch)
- CONST char *string; /* The UTF-8 string to be searched. */
- int ch; /* The Tcl_UniChar to search for. */
+
+const char *
+Tcl_UtfFindFirst(
+ const char *src, /* The UTF-8 string to be searched. */
+ int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
-
+
while (1) {
- len = Tcl_UtfToUniChar(string, &find);
+ len = TclUtfToUniChar(src, &find);
if (find == ch) {
- return string;
+ return src;
}
- if (*string == '\0') {
+ if (*src == '\0') {
return NULL;
}
- string += len;
+ src += len;
}
}
@@ -561,14 +566,13 @@ Tcl_UtfFindFirst(string, ch)
*
* Tcl_UtfFindLast --
*
- * Returns a pointer to the last occurance of the given Tcl_UniChar
- * in the NULL-terminated UTF-8 string. The NULL terminator is
- * considered part of the UTF-8 string. Equivalent to Plan 9
- * utfrrune().
+ * Returns a pointer to the last occurance of the given Tcl_UniChar in
+ * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string,
- * the return value is NULL.
+ * As above. If the Tcl_UniChar does not exist in the given string, the
+ * return value is NULL.
*
* Side effects:
* None.
@@ -576,25 +580,25 @@ Tcl_UtfFindFirst(string, ch)
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfFindLast(string, ch)
- CONST char *string; /* The UTF-8 string to be searched. */
- int ch; /* The Tcl_UniChar to search for. */
+const char *
+Tcl_UtfFindLast(
+ const char *src, /* The UTF-8 string to be searched. */
+ int ch) /* The Tcl_UniChar to search for. */
{
int len;
Tcl_UniChar find;
- CONST char *last;
-
+ const char *last;
+
last = NULL;
while (1) {
- len = Tcl_UtfToUniChar(string, &find);
+ len = TclUtfToUniChar(src, &find);
if (find == ch) {
- last = string;
+ last = src;
}
- if (*string == '\0') {
+ if (*src == '\0') {
break;
}
- string += len;
+ src += len;
}
return last;
}
@@ -604,28 +608,27 @@ Tcl_UtfFindLast(string, ch)
*
* Tcl_UtfNext --
*
- * Given a pointer to some current location in a UTF-8 string,
- * move forward one character. The caller must ensure that they
- * are not asking for the next character after the last character
- * in the string.
+ * Given a pointer to some current location in a UTF-8 string, move
+ * forward one character. The caller must ensure that they are not asking
+ * for the next character after the last character in the string.
*
* Results:
- * The return value is the pointer to the next character in
- * the UTF-8 string.
+ * The return value is the pointer to the next character in the UTF-8
+ * string.
*
* Side effects:
* None.
*
*---------------------------------------------------------------------------
*/
-
-CONST char *
-Tcl_UtfNext(str)
- CONST char *str; /* The current location in the string. */
+
+const char *
+Tcl_UtfNext(
+ const char *src) /* The current location in the string. */
{
Tcl_UniChar ch;
- return str + Tcl_UtfToUniChar(str, &ch);
+ return src + TclUtfToUniChar(src, &ch);
}
/*
@@ -633,15 +636,15 @@ Tcl_UtfNext(str)
*
* Tcl_UtfPrev --
*
- * Given a pointer to some current location in a UTF-8 string,
- * move backwards one character. This works correctly when the
- * pointer is in the middle of a UTF-8 character.
+ * Given a pointer to some current location in a UTF-8 string, move
+ * backwards one character. This works correctly when the pointer is in
+ * the middle of a UTF-8 character.
*
* Results:
- * The return value is a pointer to the previous character in the
- * UTF-8 string. If the current location was already at the
- * beginning of the string, the return value will also be a
- * pointer to the beginning of the string.
+ * The return value is a pointer to the previous character in the UTF-8
+ * string. If the current location was already at the beginning of the
+ * string, the return value will also be a pointer to the beginning of
+ * the string.
*
* Side effects:
* None.
@@ -649,44 +652,43 @@ Tcl_UtfNext(str)
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfPrev(str, start)
- CONST char *str; /* The current location in the string. */
- CONST char *start; /* Pointer to the beginning of the
- * string, to avoid going backwards too
- * far. */
+const char *
+Tcl_UtfPrev(
+ const char *src, /* The current location in the string. */
+ const char *start) /* Pointer to the beginning of the string, to
+ * avoid going backwards too far. */
{
- CONST char *look;
+ const char *look;
int i, byte;
-
- str--;
- look = str;
+
+ src--;
+ look = src;
for (i = 0; i < TCL_UTF_MAX; i++) {
if (look < start) {
- if (str < start) {
- str = start;
+ if (src < start) {
+ src = start;
}
break;
}
byte = *((unsigned char *) look);
if (byte < 0x80) {
break;
- }
+ }
if (byte >= 0xC0) {
return look;
}
look--;
}
- return str;
+ return src;
}
-
+
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharAtIndex --
*
- * Returns the Unicode character represented at the specified
- * character (not byte) position in the UTF-8 string.
+ * Returns the Unicode character represented at the specified character
+ * (not byte) position in the UTF-8 string.
*
* Results:
* As above.
@@ -696,17 +698,17 @@ Tcl_UtfPrev(str, start)
*
*---------------------------------------------------------------------------
*/
-
+
Tcl_UniChar
-Tcl_UniCharAtIndex(src, index)
- register CONST char *src; /* The UTF-8 string to dereference. */
- register int index; /* The position of the desired character. */
+Tcl_UniCharAtIndex(
+ register const char *src, /* The UTF-8 string to dereference. */
+ register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index >= 0) {
index--;
- src += Tcl_UtfToUniChar(src, &ch);
+ src += TclUtfToUniChar(src, &ch);
}
return ch;
}
@@ -716,8 +718,8 @@ Tcl_UniCharAtIndex(src, index)
*
* Tcl_UtfAtIndex --
*
- * Returns a pointer to the specified character (not byte) position
- * in the UTF-8 string.
+ * Returns a pointer to the specified character (not byte) position in
+ * the UTF-8 string.
*
* Results:
* As above.
@@ -728,16 +730,16 @@ Tcl_UniCharAtIndex(src, index)
*---------------------------------------------------------------------------
*/
-CONST char *
-Tcl_UtfAtIndex(src, index)
- register CONST char *src; /* The UTF-8 string. */
- register int index; /* The position of the desired character. */
+const char *
+Tcl_UtfAtIndex(
+ register const char *src, /* The UTF-8 string. */
+ register int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
-
+
while (index > 0) {
index--;
- src += Tcl_UtfToUniChar(src, &ch);
+ src += TclUtfToUniChar(src, &ch);
}
return src;
}
@@ -751,156 +753,48 @@ Tcl_UtfAtIndex(src, index)
*
* Results:
* Stores the bytes represented by the backslash sequence in dst and
- * returns the number of bytes written to dst. At most TCL_UTF_MAX
- * bytes are written to dst; dst must have been large enough to accept
- * those bytes. If readPtr isn't NULL then it is filled in with a
- * count of the number of bytes in the backslash sequence.
+ * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
+ * are written to dst; dst must have been large enough to accept those
+ * bytes. If readPtr isn't NULL then it is filled in with a count of the
+ * number of bytes in the backslash sequence.
*
* Side effects:
- * The maximum number of bytes it takes to represent a Unicode
- * character in UTF-8 is guaranteed to be less than the number of
- * bytes used to express the backslash sequence that represents
- * that Unicode character. If the target buffer into which the
- * caller is going to store the bytes that represent the Unicode
- * character is at least as large as the source buffer from which
- * the backslashed sequence was extracted, no buffer overruns should
- * occur.
+ * The maximum number of bytes it takes to represent a Unicode character
+ * in UTF-8 is guaranteed to be less than the number of bytes used to
+ * express the backslash sequence that represents that Unicode character.
+ * If the target buffer into which the caller is going to store the bytes
+ * that represent the Unicode character is at least as large as the
+ * source buffer from which the backslashed sequence was extracted, no
+ * buffer overruns should occur.
*
*---------------------------------------------------------------------------
*/
int
-Tcl_UtfBackslash(src, readPtr, dst)
- CONST char *src; /* Points to the backslash character of
- * a backslash sequence. */
- int *readPtr; /* Fill in with number of characters read
- * from src, unless NULL. */
- char *dst; /* Filled with the bytes represented by the
+Tcl_UtfBackslash(
+ const char *src, /* Points to the backslash character of a
+ * backslash sequence. */
+ int *readPtr, /* Fill in with number of characters read from
+ * src, unless NULL. */
+ char *dst) /* Filled with the bytes represented by the
* backslash sequence. */
{
- register CONST char *p = src+1;
- Tcl_UniChar result;
- int count, n;
- char buf[TCL_UTF_MAX];
+#define LINE_LENGTH 128
+ int numRead;
+ int result;
- if (dst == NULL) {
- dst = buf;
- }
-
- count = 2;
- switch (*p) {
+ result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
+ if (numRead == LINE_LENGTH) {
/*
- * Note: in the conversions below, use absolute values (e.g.,
- * 0xa) rather than symbolic values (e.g. \n) that get converted
- * by the compiler. It's possible that compilers on some
- * platforms will do the symbolic conversions differently, which
- * could result in non-portable Tcl scripts.
- */
-
- case 'a':
- result = 0x7;
- break;
- case 'b':
- result = 0x8;
- break;
- case 'f':
- result = 0xc;
- break;
- case 'n':
- result = 0xa;
- break;
- case 'r':
- result = 0xd;
- break;
- case 't':
- result = 0x9;
- break;
- case 'v':
- result = 0xb;
- break;
- case 'x':
- if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
- char *end;
-
- result = (unsigned char) strtoul(p+1, &end, 16);
- count = end - src;
- } else {
- count = 2;
- result = 'x';
- }
- break;
- case 'u':
- result = 0;
- for (count = 0; count < 4; count++) {
- p++;
- if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
- break;
- }
- n = *p - '0';
- if (n > 9) {
- n = n + '0' + 10 - 'A';
- }
- if (n > 16) {
- n = n + 'A' - 'a';
- }
- result = (result << 4) + n;
- }
- if (count == 0) {
- result = 'u';
- }
- count += 2;
- break;
-
- case '\n':
- do {
- p++;
- } while ((*p == ' ') || (*p == '\t'));
- result = ' ';
- count = p - src;
- break;
- case 0:
- result = '\\';
- count = 1;
- break;
- default:
- /*
- * Check for an octal number \oo?o?
- */
- if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
- result = (unsigned char)(*p - '0');
- p++;
- if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
- break;
- }
- count = 3;
- result = (unsigned char)((result << 3) + (*p - '0'));
- p++;
- if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
- break;
- }
- count = 4;
- result = (unsigned char)((result << 3) + (*p - '0'));
- break;
- }
- if (UCHAR(*p) < UNICODE_SELF) {
- result = *p;
- count = 2;
- } else {
- /*
- * We have to convert here because the user has put a
- * backslash in front of a multi-byte utf-8 character.
- * While this means nothing special, we shouldn't break up
- * a correct utf-8 character. [Bug #217987] test subst-3.2
- */
- count = Tcl_UtfToUniChar(p, &result) + 1; /* +1 for '\' */
- }
- break;
- }
+ * We ate a whole line. Pay the price of a strlen()
+ */
+ result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
+ }
if (readPtr != NULL) {
- *readPtr = count;
+ *readPtr = numRead;
}
- return Tcl_UniCharToUtf((int) result, dst);
+ return result;
}
/*
@@ -908,12 +802,12 @@ Tcl_UtfBackslash(src, readPtr, dst)
*
* Tcl_UtfToUpper --
*
- * Convert lowercase characters to uppercase characters in a UTF
- * string in place. The conversion may shrink the UTF string.
+ * Convert lowercase characters to uppercase characters in a UTF string
+ * in place. The conversion may shrink the UTF string.
*
* Results:
- * Returns the number of bytes in the resulting string
- * excluding the trailing null.
+ * Returns the number of bytes in the resulting string excluding the
+ * trailing null.
*
* Side effects:
* Writes a terminating null after the last converted character.
@@ -922,8 +816,8 @@ Tcl_UtfBackslash(src, readPtr, dst)
*/
int
-Tcl_UtfToUpper(str)
- char *str; /* String to convert in place. */
+Tcl_UtfToUpper(
+ char *str) /* String to convert in place. */
{
Tcl_UniChar ch, upChar;
char *src, *dst;
@@ -935,15 +829,15 @@ Tcl_UtfToUpper(str)
src = dst = str;
while (*src) {
- bytes = Tcl_UtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
upChar = Tcl_UniCharToUpper(ch);
/*
- * To keep badly formed Utf strings from getting inflated by
- * the conversion (thereby causing a segfault), only copy the
- * upper case char to dst if its size is <= the original char.
+ * To keep badly formed Utf strings from getting inflated by the
+ * conversion (thereby causing a segfault), only copy the upper case
+ * char to dst if its size is <= the original char.
*/
-
+
if (bytes < UtfCount(upChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
@@ -961,12 +855,12 @@ Tcl_UtfToUpper(str)
*
* Tcl_UtfToLower --
*
- * Convert uppercase characters to lowercase characters in a UTF
- * string in place. The conversion may shrink the UTF string.
+ * Convert uppercase characters to lowercase characters in a UTF string
+ * in place. The conversion may shrink the UTF string.
*
* Results:
- * Returns the number of bytes in the resulting string
- * excluding the trailing null.
+ * Returns the number of bytes in the resulting string excluding the
+ * trailing null.
*
* Side effects:
* Writes a terminating null after the last converted character.
@@ -975,28 +869,28 @@ Tcl_UtfToUpper(str)
*/
int
-Tcl_UtfToLower(str)
- char *str; /* String to convert in place. */
+Tcl_UtfToLower(
+ char *str) /* String to convert in place. */
{
Tcl_UniChar ch, lowChar;
char *src, *dst;
int bytes;
-
+
/*
* Iterate over the string until we hit the terminating null.
*/
src = dst = str;
while (*src) {
- bytes = Tcl_UtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
lowChar = Tcl_UniCharToLower(ch);
/*
- * To keep badly formed Utf strings from getting inflated by
- * the conversion (thereby causing a segfault), only copy the
- * lower case char to dst if its size is <= the original char.
+ * To keep badly formed Utf strings from getting inflated by the
+ * conversion (thereby causing a segfault), only copy the lower case
+ * char to dst if its size is <= the original char.
*/
-
+
if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
@@ -1014,13 +908,13 @@ Tcl_UtfToLower(str)
*
* Tcl_UtfToTitle --
*
- * Changes the first character of a UTF string to title case or
- * uppercase and the rest of the string to lowercase. The
- * conversion happens in place and may shrink the UTF string.
+ * Changes the first character of a UTF string to title case or uppercase
+ * and the rest of the string to lowercase. The conversion happens in
+ * place and may shrink the UTF string.
*
* Results:
- * Returns the number of bytes in the resulting string
- * excluding the trailing null.
+ * Returns the number of bytes in the resulting string excluding the
+ * trailing null.
*
* Side effects:
* Writes a terminating null after the last converted character.
@@ -1029,13 +923,13 @@ Tcl_UtfToLower(str)
*/
int
-Tcl_UtfToTitle(str)
- char *str; /* String to convert in place. */
+Tcl_UtfToTitle(
+ char *str) /* String to convert in place. */
{
Tcl_UniChar ch, titleChar, lowChar;
char *src, *dst;
int bytes;
-
+
/*
* Capitalize the first character and then lowercase the rest of the
* characters until we get to a null.
@@ -1044,7 +938,7 @@ Tcl_UtfToTitle(str)
src = dst = str;
if (*src) {
- bytes = Tcl_UtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
titleChar = Tcl_UniCharToTitle(ch);
if (bytes < UtfCount(titleChar)) {
@@ -1056,7 +950,7 @@ Tcl_UtfToTitle(str)
src += bytes;
}
while (*src) {
- bytes = Tcl_UtfToUniChar(src, &ch);
+ bytes = TclUtfToUniChar(src, &ch);
lowChar = Tcl_UniCharToLower(ch);
if (bytes < UtfCount(lowChar)) {
@@ -1074,10 +968,57 @@ Tcl_UtfToTitle(str)
/*
*----------------------------------------------------------------------
*
+ * TclpUtfNcmp2 --
+ *
+ * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
+ * ct are assumed to be at least numBytes bytes long.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclpUtfNcmp2(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
+ unsigned long numBytes) /* Number of *bytes* to compare. */
+{
+ /*
+ * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
+ * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
+ * fine in the strcmp manner.
+ */
+
+ register int result = 0;
+
+ for ( ; numBytes != 0; numBytes--, cs++, ct++) {
+ if (*cs != *ct) {
+ result = UCHAR(*cs) - UCHAR(*ct);
+ break;
+ }
+ }
+ if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
+ unsigned char c1, c2;
+
+ c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
+ c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
+ result = (c1 - c2);
+ }
+ return result;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UtfNcmp --
*
- * Compare at most n UTF chars of string cs to string ct. Both cs
- * and ct are assumed to be at least n UTF chars long.
+ * Compare at most numChars UTF chars of string cs to string ct. Both cs
+ * and ct are assumed to be at least numChars UTF chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -1089,25 +1030,28 @@ Tcl_UtfToTitle(str)
*/
int
-Tcl_UtfNcmp(cs, ct, n)
- CONST char *cs; /* UTF string to compare to ct. */
- CONST char *ct; /* UTF string cs is compared to. */
- unsigned long n; /* Number of UTF chars to compare. */
+Tcl_UtfNcmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
+ unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
+
/*
- * Cannot use memcmp()-based approach as byte representation of
- * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
- * representation of \u0001 (the byte 0x01.)
+ * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
+ * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
+ * (the byte 0x01.)
*/
- while (n-- > 0) {
+
+ while (numChars-- > 0) {
/*
- * n must be interpreted as chars, not bytes.
- * This should be called only when both strings are of
- * at least n chars long (no need for \0 check)
+ * n must be interpreted as chars, not bytes. This should be called
+ * only when both strings are of at least n chars long (no need for \0
+ * check)
*/
- cs += Tcl_UtfToUniChar(cs, &ch1);
- ct += Tcl_UtfToUniChar(ct, &ch2);
+
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
return (ch1 - ch2);
}
@@ -1120,9 +1064,9 @@ Tcl_UtfNcmp(cs, ct, n)
*
* Tcl_UtfNcasecmp --
*
- * Compare at most n UTF chars of string cs to string ct case
- * insensitive. Both cs and ct are assumed to be at least n
- * UTF chars long.
+ * Compare at most numChars UTF chars of string cs to string ct case
+ * insensitive. Both cs and ct are assumed to be at least numChars UTF
+ * chars long.
*
* Results:
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
@@ -1134,20 +1078,20 @@ Tcl_UtfNcmp(cs, ct, n)
*/
int
-Tcl_UtfNcasecmp(cs, ct, n)
- CONST char *cs; /* UTF string to compare to ct. */
- CONST char *ct; /* UTF string cs is compared to. */
- unsigned long n; /* Number of UTF chars to compare. */
+Tcl_UtfNcasecmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
+ unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
- while (n-- > 0) {
+ while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
* This should be called only when both strings are of
* at least n chars long (no need for \0 check)
*/
- cs += Tcl_UtfToUniChar(cs, &ch1);
- ct += Tcl_UtfToUniChar(ct, &ch2);
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
@@ -1162,6 +1106,46 @@ Tcl_UtfNcasecmp(cs, ct, n)
/*
*----------------------------------------------------------------------
*
+ * Tcl_UtfNcasecmp --
+ *
+ * Compare UTF chars of string cs to string ct case insensitively.
+ * Replacement for strcasecmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCasecmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ while (*cs && *ct) {
+ Tcl_UniChar ch1, ch2;
+
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+ ch1 = Tcl_UniCharToLower(ch1);
+ ch2 = Tcl_UniCharToLower(ch2);
+ if (ch1 != ch2) {
+ return ch1 - ch2;
+ }
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UniCharToUpper --
*
* Compute the uppercase equivalent of the given Unicode character.
@@ -1176,16 +1160,15 @@ Tcl_UtfNcasecmp(cs, ct, n)
*/
Tcl_UniChar
-Tcl_UniCharToUpper(ch)
- int ch; /* Unicode character to convert. */
+Tcl_UniCharToUpper(
+ int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x04) {
- return (Tcl_UniChar) (ch - GetDelta(info));
- } else {
- return ch;
+ ch -= GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1205,16 +1188,15 @@ Tcl_UniCharToUpper(ch)
*/
Tcl_UniChar
-Tcl_UniCharToLower(ch)
- int ch; /* Unicode character to convert. */
+Tcl_UniCharToLower(
+ int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x02) {
- return (Tcl_UniChar) (ch + GetDelta(info));
- } else {
- return ch;
+ ch += GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1234,8 +1216,8 @@ Tcl_UniCharToLower(ch)
*/
Tcl_UniChar
-Tcl_UniCharToTitle(ch)
- int ch; /* Unicode character to convert. */
+Tcl_UniCharToTitle(
+ int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
int mode = GetCaseType(info);
@@ -1245,12 +1227,11 @@ Tcl_UniCharToTitle(ch)
* Subtract or add one depending on the original case.
*/
- return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+ ch += ((mode & 0x4) ? -1 : 1);
} else if (mode == 0x4) {
- return (Tcl_UniChar) (ch - GetDelta(info));
- } else {
- return ch;
+ ch -= GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1258,7 +1239,7 @@ Tcl_UniCharToTitle(ch)
*
* Tcl_UniCharLen --
*
- * Find the length of a UniChar string. The str input must be null
+ * Find the length of a UniChar string. The str input must be null
* terminated.
*
* Results:
@@ -1271,14 +1252,14 @@ Tcl_UniCharToTitle(ch)
*/
int
-Tcl_UniCharLen(str)
- CONST Tcl_UniChar *str; /* Unicode string to find length of. */
+Tcl_UniCharLen(
+ const Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
int len = 0;
-
- while (*str != '\0') {
+
+ while (*uniStr != '\0') {
len++;
- str++;
+ uniStr++;
}
return len;
}
@@ -1288,11 +1269,11 @@ Tcl_UniCharLen(str)
*
* Tcl_UniCharNcmp --
*
- * Compare at most n unichars of string cs to string ct. Both cs
- * and ct are assumed to be at least n unichars long.
+ * Compare at most numChars unichars of string ucs to string uct.
+ * Both ucs and uct are assumed to be at least numChars unichars long.
*
* Results:
- * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
*
* Side effects:
* None.
@@ -1301,20 +1282,30 @@ Tcl_UniCharLen(str)
*/
int
-Tcl_UniCharNcmp(cs, ct, n)
- CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
- CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
- unsigned long n; /* Number of unichars to compare. */
+Tcl_UniCharNcmp(
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ unsigned long numChars) /* Number of unichars to compare. */
{
- for ( ; n != 0; n--, cs++, ct++) {
- if (*cs != *ct) {
- return (*cs - *ct);
- }
- if (*cs == '\0') {
- break;
+#ifdef WORDS_BIGENDIAN
+ /*
+ * We are definitely on a big-endian machine; memcmp() is safe
+ */
+
+ return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
+
+#else /* !WORDS_BIGENDIAN */
+ /*
+ * We can't simply call memcmp() because that is not lexically correct.
+ */
+
+ for ( ; numChars != 0; ucs++, uct++, numChars--) {
+ if (*ucs != *uct) {
+ return (*ucs - *uct);
}
}
return 0;
+#endif /* WORDS_BIGENDIAN */
}
/*
@@ -1322,12 +1313,12 @@ Tcl_UniCharNcmp(cs, ct, n)
*
* Tcl_UniCharNcasecmp --
*
- * Compare at most n unichars of string cs to string ct case
- * insensitive. Both cs and ct are assumed to be at least n
+ * Compare at most numChars unichars of string ucs to string uct case
+ * insensitive. Both ucs and uct are assumed to be at least numChars
* unichars long.
*
* Results:
- * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
*
* Side effects:
* None.
@@ -1336,18 +1327,19 @@ Tcl_UniCharNcmp(cs, ct, n)
*/
int
-Tcl_UniCharNcasecmp(cs, ct, n)
- CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
- CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
- unsigned long n; /* Number of unichars to compare. */
+Tcl_UniCharNcasecmp(
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ unsigned long numChars) /* Number of unichars to compare. */
{
- for ( ; n != 0; n--, cs++, ct++) {
- if ((*cs != *ct) &&
- (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
- return (*cs - *ct);
- }
- if (*cs == '\0') {
- break;
+ for ( ; numChars != 0; numChars--, ucs++, uct++) {
+ if (*ucs != *uct) {
+ Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
+ Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
+
+ if (lcs != lct) {
+ return (lcs - lct);
+ }
}
}
return 0;
@@ -1370,12 +1362,10 @@ Tcl_UniCharNcasecmp(cs, ct, n)
*/
int
-Tcl_UniCharIsAlnum(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsAlnum(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+ return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1395,11 +1385,10 @@ Tcl_UniCharIsAlnum(ch)
*/
int
-Tcl_UniCharIsAlpha(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsAlpha(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((ALPHA_BITS >> category) & 1);
+ return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1419,10 +1408,10 @@ Tcl_UniCharIsAlpha(ch)
*/
int
-Tcl_UniCharIsControl(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsControl(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+ return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1442,11 +1431,10 @@ Tcl_UniCharIsControl(ch)
*/
int
-Tcl_UniCharIsDigit(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsDigit(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
- == DECIMAL_DIGIT_NUMBER);
+ return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
/*
@@ -1466,11 +1454,10 @@ Tcl_UniCharIsDigit(ch)
*/
int
-Tcl_UniCharIsGraph(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsGraph(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
+ return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1490,10 +1477,10 @@ Tcl_UniCharIsGraph(ch)
*/
int
-Tcl_UniCharIsLower(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsLower(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+ return (GetCategory(ch) == LOWERCASE_LETTER);
}
/*
@@ -1513,11 +1500,10 @@ Tcl_UniCharIsLower(ch)
*/
int
-Tcl_UniCharIsPrint(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsPrint(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PRINT_BITS >> category) & 1);
+ return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1537,11 +1523,10 @@ Tcl_UniCharIsPrint(ch)
*/
int
-Tcl_UniCharIsPunct(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsPunct(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PUNCT_BITS >> category) & 1);
+ return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1561,21 +1546,22 @@ Tcl_UniCharIsPunct(ch)
*/
int
-Tcl_UniCharIsSpace(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsSpace(
+ int ch) /* Unicode character to test. */
{
- register int category;
-
/*
* If the character is within the first 127 characters, just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (ch < 0x80) {
- return isspace(UCHAR(ch)); /* INTL: ISO space */
+ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ return TclIsSpaceProc((char) ch);
+ } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e
+ || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060
+ || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) {
+ return 1;
} else {
- category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((SPACE_BITS >> category) & 1);
+ return ((SPACE_BITS >> GetCategory(ch)) & 1);
}
}
@@ -1596,10 +1582,10 @@ Tcl_UniCharIsSpace(ch)
*/
int
-Tcl_UniCharIsUpper(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsUpper(
+ int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+ return (GetCategory(ch) == UPPERCASE_LETTER);
}
/*
@@ -1607,8 +1593,7 @@ Tcl_UniCharIsUpper(ch)
*
* Tcl_UniCharIsWordChar --
*
- * Test if a character is alphanumeric or a connector punctuation
- * mark.
+ * Test if a character is alphanumeric or a connector punctuation mark.
*
* Results:
* Returns 1 if character is a word character.
@@ -1620,12 +1605,10 @@ Tcl_UniCharIsUpper(ch)
*/
int
-Tcl_UniCharIsWordChar(ch)
- int ch; /* Unicode character to test. */
+Tcl_UniCharIsWordChar(
+ int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+ return ((WORD_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1634,14 +1617,16 @@ Tcl_UniCharIsWordChar(ch)
* Tcl_UniCharCaseMatch --
*
* See if a particular Unicode string matches a particular pattern.
- * Allows case insensitivity. This is the Unicode equivalent of
- * the char* Tcl_StringCaseMatch.
+ * Allows case insensitivity. This is the Unicode equivalent of the char*
+ * Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated.
+ * This has no provision for counted UniChar strings, thus should not be
+ * used where NULLs are expected in the UniChar string. Use
+ * TclUniCharMatch where possible.
*
* Results:
- * The return value is 1 if string matches pattern, and
- * 0 otherwise. The matching operation permits the following
- * special characters in the pattern: *?\[] (see the manual
- * entry for details on what these mean).
+ * The return value is 1 if string matches pattern, and 0 otherwise. The
+ * matching operation permits the following special characters in the
+ * pattern: *?\[] (see the manual entry for details on what these mean).
*
* Side effects:
* None.
@@ -1650,44 +1635,48 @@ Tcl_UniCharIsWordChar(ch)
*/
int
-Tcl_UniCharCaseMatch(string, pattern, nocase)
- CONST Tcl_UniChar *string; /* Unicode String. */
- CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
+Tcl_UniCharCaseMatch(
+ const Tcl_UniChar *uniStr, /* Unicode String. */
+ const Tcl_UniChar *uniPattern,
+ /* Pattern, which may contain special
* characters. */
- int nocase; /* 0 for case sensitive, 1 for insensitive */
+ int nocase) /* 0 for case sensitive, 1 for insensitive */
{
Tcl_UniChar ch1, p;
-
+
while (1) {
- p = *pattern;
-
+ p = *uniPattern;
+
/*
- * See if we're at the end of both the pattern and the string. If
- * so, we succeeded. If we're at the end of the pattern but not at
- * the end of the string, we failed.
+ * See if we're at the end of both the pattern and the string. If so,
+ * we succeeded. If we're at the end of the pattern but not at the end
+ * of the string, we failed.
*/
-
+
if (p == 0) {
- return (*string == 0);
+ return (*uniStr == 0);
}
- if ((*string == 0) && (p != '*')) {
+ if ((*uniStr == 0) && (p != '*')) {
return 0;
}
/*
- * Check for a "*" as the next pattern character. It matches any
- * substring. We handle this by skipping all the characters up to the
+ * Check for a "*" as the next pattern character. It matches any
+ * substring. We handle this by skipping all the characters up to the
* next matching one in the pattern, and then calling ourselves
* recursively for each postfix of string, until either we match or we
* reach the end of the string.
*/
-
+
if (p == '*') {
/*
* Skip all successive *'s in the pattern
*/
- while (*(++pattern) == '*') {}
- p = *pattern;
+
+ while (*(++uniPattern) == '*') {
+ /* empty body */
+ }
+ p = *uniPattern;
if (p == 0) {
return 1;
}
@@ -1700,20 +1689,216 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
* quickly if the next char in the pattern isn't a special
* character
*/
+
if ((p != '[') && (p != '?') && (p != '\\')) {
if (nocase) {
- while (*string && (p != *string)
+ while (*uniStr && (p != *uniStr)
+ && (p != Tcl_UniCharToLower(*uniStr))) {
+ uniStr++;
+ }
+ } else {
+ while (*uniStr && (p != *uniStr)) {
+ uniStr++;
+ }
+ }
+ }
+ if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
+ return 1;
+ }
+ if (*uniStr == 0) {
+ return 0;
+ }
+ uniStr++;
+ }
+ }
+
+ /*
+ * Check for a "?" as the next pattern character. It matches any
+ * single character.
+ */
+
+ if (p == '?') {
+ uniPattern++;
+ uniStr++;
+ continue;
+ }
+
+ /*
+ * Check for a "[" as the next pattern character. It is followed by a
+ * list of characters that are acceptable, or by a range (two
+ * characters separated by "-").
+ */
+
+ if (p == '[') {
+ Tcl_UniChar startChar, endChar;
+
+ uniPattern++;
+ ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
+ uniStr++;
+ while (1) {
+ if ((*uniPattern == ']') || (*uniPattern == 0)) {
+ return 0;
+ }
+ startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
+ : *uniPattern);
+ uniPattern++;
+ if (*uniPattern == '-') {
+ uniPattern++;
+ if (*uniPattern == 0) {
+ return 0;
+ }
+ endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
+ : *uniPattern);
+ uniPattern++;
+ if (((startChar <= ch1) && (ch1 <= endChar))
+ || ((endChar <= ch1) && (ch1 <= startChar))) {
+ /*
+ * Matches ranges of form [a-z] or [z-a].
+ */
+ break;
+ }
+ } else if (startChar == ch1) {
+ break;
+ }
+ }
+ while (*uniPattern != ']') {
+ if (*uniPattern == 0) {
+ uniPattern--;
+ break;
+ }
+ uniPattern++;
+ }
+ uniPattern++;
+ continue;
+ }
+
+ /*
+ * If the next pattern character is '\', just strip off the '\' so we
+ * do exact matching on the character that follows.
+ */
+
+ if (p == '\\') {
+ if (*(++uniPattern) == '\0') {
+ return 0;
+ }
+ }
+
+ /*
+ * There's no special character. Just make sure that the next bytes of
+ * each string match.
+ */
+
+ if (nocase) {
+ if (Tcl_UniCharToLower(*uniStr) !=
+ Tcl_UniCharToLower(*uniPattern)) {
+ return 0;
+ }
+ } else if (*uniStr != *uniPattern) {
+ return 0;
+ }
+ uniStr++;
+ uniPattern++;
+ }
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * TclUniCharMatch --
+ *
+ * See if a particular Unicode string matches a particular pattern.
+ * Allows case insensitivity. This is the Unicode equivalent of the char*
+ * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
+ * Strings, so embedded NULLs are allowed.
+ *
+ * Results:
+ * The return value is 1 if string matches pattern, and 0 otherwise. The
+ * matching operation permits the following special characters in the
+ * pattern: *?\[] (see the manual entry for details on what these mean).
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUniCharMatch(
+ const Tcl_UniChar *string, /* Unicode String. */
+ int strLen, /* Length of String */
+ const Tcl_UniChar *pattern, /* Pattern, which may contain special
+ * characters. */
+ int ptnLen, /* Length of Pattern */
+ int nocase) /* 0 for case sensitive, 1 for insensitive */
+{
+ const Tcl_UniChar *stringEnd, *patternEnd;
+ Tcl_UniChar p;
+
+ stringEnd = string + strLen;
+ patternEnd = pattern + ptnLen;
+
+ while (1) {
+ /*
+ * See if we're at the end of both the pattern and the string. If so,
+ * we succeeded. If we're at the end of the pattern but not at the end
+ * of the string, we failed.
+ */
+
+ if (pattern == patternEnd) {
+ return (string == stringEnd);
+ }
+ p = *pattern;
+ if ((string == stringEnd) && (p != '*')) {
+ return 0;
+ }
+
+ /*
+ * Check for a "*" as the next pattern character. It matches any
+ * substring. We handle this by skipping all the characters up to the
+ * next matching one in the pattern, and then calling ourselves
+ * recursively for each postfix of string, until either we match or we
+ * reach the end of the string.
+ */
+
+ if (p == '*') {
+ /*
+ * Skip all successive *'s in the pattern.
+ */
+
+ while (*(++pattern) == '*') {
+ /* empty body */
+ }
+ if (pattern == patternEnd) {
+ return 1;
+ }
+ p = *pattern;
+ if (nocase) {
+ p = Tcl_UniCharToLower(p);
+ }
+ while (1) {
+ /*
+ * Optimization for matching - cruise through the string
+ * quickly if the next char in the pattern isn't a special
+ * character.
+ */
+
+ if ((p != '[') && (p != '?') && (p != '\\')) {
+ if (nocase) {
+ while ((string < stringEnd) && (p != *string)
&& (p != Tcl_UniCharToLower(*string))) {
string++;
}
} else {
- while (*string && (p != *string)) { string++; }
+ while ((string < stringEnd) && (p != *string)) {
+ string++;
+ }
}
}
- if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
+ if (TclUniCharMatch(string, stringEnd - string,
+ pattern, patternEnd - pattern, nocase)) {
return 1;
}
- if (*string == 0) {
+ if (string == stringEnd) {
return 0;
}
string++;
@@ -1721,8 +1906,8 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
}
/*
- * Check for a "?" as the next pattern character. It matches
- * any single character.
+ * Check for a "?" as the next pattern character. It matches any
+ * single character.
*/
if (p == '?') {
@@ -1732,26 +1917,26 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
}
/*
- * Check for a "[" as the next pattern character. It is followed
- * by a list of characters that are acceptable, or by a range
- * (two characters separated by "-").
+ * Check for a "[" as the next pattern character. It is followed by a
+ * list of characters that are acceptable, or by a range (two
+ * characters separated by "-").
*/
-
+
if (p == '[') {
- Tcl_UniChar startChar, endChar;
+ Tcl_UniChar ch1, startChar, endChar;
pattern++;
ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
string++;
while (1) {
- if ((*pattern == ']') || (*pattern == 0)) {
+ if ((*pattern == ']') || (pattern == patternEnd)) {
return 0;
}
startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
pattern++;
if (*pattern == '-') {
pattern++;
- if (*pattern == 0) {
+ if (pattern == patternEnd) {
return 0;
}
endChar = (nocase ? Tcl_UniCharToLower(*pattern)
@@ -1769,7 +1954,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
}
}
while (*pattern != ']') {
- if (*pattern == 0) {
+ if (pattern == patternEnd) {
pattern--;
break;
}
@@ -1780,19 +1965,19 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
}
/*
- * If the next pattern character is '\', just strip off the '\'
- * so we do exact matching on the character that follows.
+ * If the next pattern character is '\', just strip off the '\' so we
+ * do exact matching on the character that follows.
*/
if (p == '\\') {
- if (*(++pattern) == '\0') {
+ if (++pattern == patternEnd) {
return 0;
}
}
/*
- * There's no special character. Just make sure that the next
- * bytes of each string match.
+ * There's no special character. Just make sure that the next bytes of
+ * each string match.
*/
if (nocase) {
@@ -1806,3 +1991,11 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
pattern++;
}
}
+
+/*
+ * Local Variables:
+ * mode: c
+ * c-basic-offset: 4
+ * fill-column: 78
+ * End:
+ */