summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c135
1 files changed, 112 insertions, 23 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 25cc2d1..43636b4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -94,7 +94,7 @@ static const unsigned char totalBytes[256] = {
int
TclUtfCount(
- int ch) /* The Tcl_UniChar whose size is returned. */
+ int ch) /* The Unicode character whose size is returned. */
{
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
@@ -398,7 +398,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar ch, *w, *wString;
+ Tcl_UniChar ch = 0, *w, *wString;
const char *p, *end;
int oldLength;
@@ -522,13 +522,13 @@ Tcl_NumUtfChars(
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the first occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
- * return value is NULL.
+ * As above. If the Unicode character does not exist in the given string,
+ * the return value is NULL.
*
* Side effects:
* None.
@@ -539,14 +539,21 @@ Tcl_NumUtfChars(
const char *
Tcl_UtfFindFirst(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
+ int len, fullchar;
Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (fullchar == ch) {
return src;
}
if (*src == '\0') {
@@ -561,12 +568,12 @@ Tcl_UtfFindFirst(
*
* Tcl_UtfFindLast --
*
- * Returns a pointer to the last occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the last occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
+ * As above. If the Unicode character does not exist in the given string, the
* return value is NULL.
*
* Side effects:
@@ -578,16 +585,23 @@ Tcl_UtfFindFirst(
const char *
Tcl_UtfFindLast(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
+ int len, fullchar;
Tcl_UniChar find = 0;
const char *last;
last = NULL;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (fullchar == ch) {
last = src;
}
if (*src == '\0') {
@@ -687,7 +701,7 @@ Tcl_UtfPrev(
*
* Tcl_UniCharAtIndex --
*
- * Returns the Unicode character represented at the specified character
+ * Returns the Tcl_UniChar represented at the specified character
* (not byte) position in the UTF-8 string.
*
* Results:
@@ -706,8 +720,7 @@ Tcl_UniCharAtIndex(
{
Tcl_UniChar ch = 0;
- while (index >= 0) {
- index--;
+ while (index-- >= 0) {
src += TclUtfToUniChar(src, &ch);
}
return ch;
@@ -737,8 +750,7 @@ Tcl_UtfAtIndex(
{
Tcl_UniChar ch = 0;
- while (index > 0) {
- index--;
+ while (index-- > 0) {
src += TclUtfToUniChar(src, &ch);
}
return src;
@@ -1053,6 +1065,16 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
return (ch1 - ch2);
}
}
@@ -1084,6 +1106,7 @@ Tcl_UtfNcasecmp(
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1 = 0, ch2 = 0;
+
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1093,6 +1116,16 @@ Tcl_UtfNcasecmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1102,11 +1135,57 @@ Tcl_UtfNcasecmp(
}
return 0;
}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ * Compare UTF chars of string cs to string ct case sensitively.
+ * Replacement for strcmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
+ while (*cs && *ct) {
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
+ return ch1 - ch2;
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
/*
*----------------------------------------------------------------------
*
- * Tcl_UtfNcasecmp --
+ * TclUtfCasecmp --
*
* Compare UTF chars of string cs to string ct case insensitively.
* Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1126,12 +1205,22 @@ TclUtfCasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct) /* UTF string cs is compared to. */
{
- while (*cs && *ct) {
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+ while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {