summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2017-12-01 15:03:46 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2017-12-01 15:03:46 (GMT)
commit71c032c81630285df2581432aeb509295fbb39ed (patch)
treed7736549826d2697b558a2db03759c1f7e3f3524 /generic/tclUtf.c
parent4bf4b7d35c401066e61ad2ce1d21b04709a57938 (diff)
parent200910b9c19ac1986d28677de7b427ee774d8e40 (diff)
downloadtcl-71c032c81630285df2581432aeb509295fbb39ed.zip
tcl-71c032c81630285df2581432aeb509295fbb39ed.tar.gz
tcl-71c032c81630285df2581432aeb509295fbb39ed.tar.bz2
merge trunk
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c130
1 files changed, 111 insertions, 19 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 5f87ffa..dbe5e45 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -94,7 +94,7 @@ static const unsigned char totalBytes[256] = {
int
TclUtfCount(
- int ch) /* The Tcl_UniChar whose size is returned. */
+ int ch) /* The Unicode character whose size is returned. */
{
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
@@ -398,7 +398,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar ch, *w, *wString;
+ Tcl_UniChar ch = 0, *w, *wString;
const char *p, *end;
size_t oldLength;
@@ -521,13 +521,13 @@ Tcl_NumUtfChars(
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the first occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
- * return value is NULL.
+ * As above. If the Unicode character does not exist in the given string,
+ * the return value is NULL.
*
* Side effects:
* None.
@@ -538,14 +538,21 @@ Tcl_NumUtfChars(
const char *
Tcl_UtfFindFirst(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
+ int len, fullchar;
Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (fullchar == ch) {
return src;
}
if (*src == '\0') {
@@ -560,12 +567,12 @@ Tcl_UtfFindFirst(
*
* Tcl_UtfFindLast --
*
- * Returns a pointer to the last occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the last occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
+ * As above. If the Unicode character does not exist in the given string, the
* return value is NULL.
*
* Side effects:
@@ -577,16 +584,23 @@ Tcl_UtfFindFirst(
const char *
Tcl_UtfFindLast(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
+ int len, fullchar;
Tcl_UniChar find = 0;
const char *last;
last = NULL;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (fullchar == ch) {
last = src;
}
if (*src == '\0') {
@@ -686,7 +700,7 @@ Tcl_UtfPrev(
*
* Tcl_UniCharAtIndex --
*
- * Returns the Unicode character represented at the specified character
+ * Returns the Tcl_UniChar represented at the specified character
* (not byte) position in the UTF-8 string.
*
* Results:
@@ -705,6 +719,7 @@ Tcl_UniCharAtIndex(
{
Tcl_UniChar ch = 0;
+ src += TclUtfToUniChar(src, &ch);
while (index--) {
src += TclUtfToUniChar(src, &ch);
}
@@ -1052,6 +1067,16 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
return (ch1 - ch2);
}
}
@@ -1083,6 +1108,7 @@ Tcl_UtfNcasecmp(
size_t numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1 = 0, ch2 = 0;
+
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1092,6 +1118,16 @@ Tcl_UtfNcasecmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1101,11 +1137,57 @@ Tcl_UtfNcasecmp(
}
return 0;
}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ * Compare UTF chars of string cs to string ct case sensitively.
+ * Replacement for strcmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
+ while (*cs && *ct) {
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
+ return ch1 - ch2;
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
/*
*----------------------------------------------------------------------
*
- * Tcl_UtfNcasecmp --
+ * TclUtfCasecmp --
*
* Compare UTF chars of string cs to string ct case insensitively.
* Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1125,12 +1207,22 @@ TclUtfCasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct) /* UTF string cs is compared to. */
{
- while (*cs && *ct) {
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+ while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {