summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--doc/ToUpper.32
-rw-r--r--doc/UniCharIsAlpha.35
-rw-r--r--doc/Utf.32
-rw-r--r--generic/tclUtf.c74
4 files changed, 61 insertions, 22 deletions
diff --git a/doc/ToUpper.3 b/doc/ToUpper.3
index b933e9c..be614e7 100644
--- a/doc/ToUpper.3
+++ b/doc/ToUpper.3
@@ -33,7 +33,7 @@ int
.SH ARGUMENTS
.AS char *str in/out
.AP int ch in
-The Tcl_UniChar to be converted.
+The Unicode character to be converted.
.AP char *str in/out
Pointer to UTF-8 string to be converted in place.
.BE
diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3
index 2336c34..5ba3fc9 100644
--- a/doc/UniCharIsAlpha.3
+++ b/doc/UniCharIsAlpha.3
@@ -53,14 +53,11 @@ The Tcl_UniChar to be examined.
.SH DESCRIPTION
.PP
-All of the routines described examine Tcl_UniChars and return a
+All of the routines described examine Unicode characters and return a
boolean value. A non-zero return value means that the character does
belong to the character class associated with the called routine. The
rest of this document just describes the character classes associated
with the various routines.
-.PP
-Note: A Tcl_UniChar is a Unicode character represented as an unsigned,
-fixed-size quantity.
.SH "CHARACTER CLASSES"
.PP
diff --git a/doc/Utf.3 b/doc/Utf.3
index 378c806..9d0c617 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -77,7 +77,7 @@ int
Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most
\fBTCL_UTF_MAX\fR bytes are stored in the buffer.
.AP int ch in
-The Tcl_UniChar to be converted or examined.
+The Unicode character to be converted or examined.
.AP Tcl_UniChar *chPtr out
Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
.AP "const char" *src in
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 25cc2d1..1a8367b 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -398,7 +398,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar ch, *w, *wString;
+ Tcl_UniChar ch = 0, *w, *wString;
const char *p, *end;
int oldLength;
@@ -522,13 +522,13 @@ Tcl_NumUtfChars(
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the first occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
- * return value is NULL.
+ * As above. If the Unicode character does not exist in the given string,
+ * the return value is NULL.
*
* Side effects:
* None.
@@ -539,14 +539,21 @@ Tcl_NumUtfChars(
const char *
Tcl_UtfFindFirst(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
+ int len, fullchar;
Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (find == fullchar) {
return src;
}
if (*src == '\0') {
@@ -561,8 +568,8 @@ Tcl_UtfFindFirst(
*
* Tcl_UtfFindLast --
*
- * Returns a pointer to the last occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the last occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
*
* Results:
@@ -578,16 +585,23 @@ Tcl_UtfFindFirst(
const char *
Tcl_UtfFindLast(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
+ int len, fullchar;
Tcl_UniChar find = 0;
const char *last;
last = NULL;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (find == fullchar) {
last = src;
}
if (*src == '\0') {
@@ -1052,6 +1066,15 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+ /* map high surrogate characters to values > 0xffff */
+ if ((ch1 & 0xFC00) == 0xD800) {
+ ch1 += 0x4000;
+ }
+ if ((ch2 & 0xFC00) == 0xD800) {
+ ch2 += 0x4000;
+ }
+#endif
if (ch1 != ch2) {
return (ch1 - ch2);
}
@@ -1084,6 +1107,7 @@ Tcl_UtfNcasecmp(
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1 = 0, ch2 = 0;
+
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1092,6 +1116,15 @@ Tcl_UtfNcasecmp(
*/
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+ /* map high surrogate characters to values > 0xffff */
+ if ((ch1 & 0xFC00) == 0xD800) {
+ ch1 += 0x4000;
+ }
+ if ((ch2 & 0xFC00) == 0xD800) {
+ ch2 += 0x4000;
+ }
+#endif
if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
@@ -1106,7 +1139,7 @@ Tcl_UtfNcasecmp(
/*
*----------------------------------------------------------------------
*
- * Tcl_UtfNcasecmp --
+ * TclUtfCasecmp --
*
* Compare UTF chars of string cs to string ct case insensitively.
* Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1126,11 +1159,20 @@ TclUtfCasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct) /* UTF string cs is compared to. */
{
- while (*cs && *ct) {
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+ while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+ /* map high surrogate characters to values > 0xffff */
+ if ((ch1 & 0xFC00) == 0xD800) {
+ ch1 += 0x4000;
+ }
+ if ((ch2 & 0xFC00) == 0xD800) {
+ ch2 += 0x4000;
+ }
+#endif
if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);