summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2017-11-30 13:14:43 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2017-11-30 13:14:43 (GMT)
commit72998869a3be3534fec99499faabe2d1557d6bcb (patch)
tree85593ed297518fcde58853b7fa1d2cb98b1a0e6d /generic/tclUtf.c
parentc47ba70f4545f7c961160a6beb0a466e3bfd5532 (diff)
downloadtcl-72998869a3be3534fec99499faabe2d1557d6bcb.zip
tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.gz
tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.bz2
Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely. Also fix various initializations, which only make a difference when TCL_UTF_MAX == 4.
Add new test-cases which demonstrate the fix. For TCL_UTF_MAX == 4, surrogates will now be handled as expected as well when sorting.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c103
1 files changed, 75 insertions, 28 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index a72394d..43636b4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -720,8 +720,7 @@ Tcl_UniCharAtIndex(
{
Tcl_UniChar ch = 0;
- while (index >= 0) {
- index--;
+ while (index-- >= 0) {
src += TclUtfToUniChar(src, &ch);
}
return ch;
@@ -751,8 +750,7 @@ Tcl_UtfAtIndex(
{
Tcl_UniChar ch = 0;
- while (index > 0) {
- index--;
+ while (index-- > 0) {
src += TclUtfToUniChar(src, &ch);
}
return src;
@@ -1066,16 +1064,17 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
return (ch1 - ch2);
}
}
@@ -1116,16 +1115,17 @@ Tcl_UtfNcasecmp(
*/
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1135,6 +1135,52 @@ Tcl_UtfNcasecmp(
}
return 0;
}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ * Compare UTF chars of string cs to string ct case sensitively.
+ * Replacement for strcmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
+ while (*cs && *ct) {
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
+ return ch1 - ch2;
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
/*
*----------------------------------------------------------------------
@@ -1164,16 +1210,17 @@ TclUtfCasecmp(
while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {