summary refs log tree commit diff stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 121
1 files changed, 74 insertions, 47 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 31e52ba..15529c7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -7,8 +7,6 @@
*
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
- *
- * RCS: @(#) $Id: tclUtf.c,v 1.40 2009/09/07 07:28:38 das Exp $
*/
#include "tclInt.h"
@@ -28,28 +26,27 @@
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
+
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
| (1 << PARAGRAPH_SEPARATOR))
-#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
-
-#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
- (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
- (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
- (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
- (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
- (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
- (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
- (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
- (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+ (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+ (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+ (1 << OTHER_NUMBER) | \
+ (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+ (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+
/*
* Unicode characters less than this value are represented by themselves in
* UTF-8 strings.
@@ -1109,6 +1106,46 @@ Tcl_UtfNcasecmp(
/*
*----------------------------------------------------------------------
*
+ * TclUtfCasecmp --
+ *
+ * Compare UTF chars of string cs to string ct case insensitively.
+ * Replacement for strcasecmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCasecmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ while (*cs && *ct) { /* Loop until either string reaches its NUL byte. */
+ Tcl_UniChar ch1, ch2;
+
+ cs += TclUtfToUniChar(cs, &ch1); /* Advance cs by the returned length; ch1 receives the character. */
+ ct += TclUtfToUniChar(ct, &ch2); /* Likewise for ct and ch2. */
+ if (ch1 != ch2) { /* Identical characters need no case conversion. */
+ ch1 = Tcl_UniCharToLower(ch1);
+ ch2 = Tcl_UniCharToLower(ch2);
+ if (ch1 != ch2) {
+ return ch1 - ch2; /* Sign of the lowercased difference orders cs against ct. */
+ }
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct); /* At least one string ended: 0 if both did, otherwise the one that ended compares lower. */
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UniCharToUpper --
*
* Compute the uppercase equivalent of the given Unicode character.
@@ -1129,10 +1166,9 @@ Tcl_UniCharToUpper(
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x04) {
- return (Tcl_UniChar) (ch - GetDelta(info));
- } else {
- return ch;
+ ch -= GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1158,10 +1194,9 @@ Tcl_UniCharToLower(
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x02) {
- return (Tcl_UniChar) (ch + GetDelta(info));
- } else {
- return ch;
+ ch += GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1192,12 +1227,11 @@ Tcl_UniCharToTitle(
* Subtract or add one depending on the original case.
*/
- return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+ ch += ((mode & 0x4) ? -1 : 1);
} else if (mode == 0x4) {
- return (Tcl_UniChar) (ch - GetDelta(info));
- } else {
- return ch;
+ ch -= GetDelta(info);
}
+ return (Tcl_UniChar) ch;
}
/*
@@ -1331,9 +1365,7 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+ return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1356,8 +1388,7 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((ALPHA_BITS >> category) & 1);
+ return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1380,7 +1411,7 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+ return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1403,7 +1434,7 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
- return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
+ return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
/*
@@ -1426,8 +1457,7 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
+ return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1450,7 +1480,7 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+ return (GetCategory(ch) == LOWERCASE_LETTER);
}
/*
@@ -1473,8 +1503,7 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PRINT_BITS >> category) & 1);
+ return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
/*
@@ -1497,8 +1526,7 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((PUNCT_BITS >> category) & 1);
+ return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
/*
@@ -1521,18 +1549,19 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
- register int category;
-
/*
* If the character is below 0x80, use the single-byte whitespace
* test; otherwise consult the Unicode table.
*/
- if (ch < 0x80) {
- return isspace(UCHAR(ch)); /* INTL: ISO space */
+ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ return TclIsSpaceProc((char) ch);
+ } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e
+ || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060
+ || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) {
+ return 1;
} else {
- category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
- return ((SPACE_BITS >> category) & 1);
+ return ((SPACE_BITS >> GetCategory(ch)) & 1);
}
}
@@ -1556,7 +1585,7 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
- return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+ return (GetCategory(ch) == UPPERCASE_LETTER);
}
/*
@@ -1579,9 +1608,7 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
- register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
- return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+ return ((WORD_BITS >> GetCategory(ch)) & 1);
}
/*