summary | refs | log | tree | commit | diff | stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 141
1 files changed, 102 insertions, 39 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 25cc2d1..94dd628 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -483,7 +483,7 @@ Tcl_NumUtfChars(
* for strlen(string). */
{
Tcl_UniChar ch = 0;
- register int i = 0;
+ register int i = 0, n;
/*
* The separate implementations are faster.
@@ -494,7 +494,11 @@ Tcl_NumUtfChars(
if (length < 0) {
while (*src != '\0') {
- src += TclUtfToUniChar(src, &ch);
+ n = TclUtfToUniChar(src, &ch);
+ if (!n) {
+ n = Tcl_UtfToUniChar(src, &ch);
+ }
+ src += n;
i++;
}
if (i < 0) i = INT_MAX; /* Bug [2738427] */
@@ -502,12 +506,20 @@ Tcl_NumUtfChars(
register const char *endPtr = src + length - TCL_UTF_MAX;
while (src < endPtr) {
- src += TclUtfToUniChar(src, &ch);
+ n = TclUtfToUniChar(src, &ch);
+ if (!n) {
+ n = Tcl_UtfToUniChar(src, &ch);
+ }
+ src += n;
i++;
}
endPtr += TCL_UTF_MAX;
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
- src += TclUtfToUniChar(src, &ch);
+ n = TclUtfToUniChar(src, &ch);
+ if (!n) {
+ n = Tcl_UtfToUniChar(src, &ch);
+ }
+ src += n;
i++;
}
if (src < endPtr) {
@@ -699,16 +711,25 @@ Tcl_UtfPrev(
*---------------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharAtIndex(
register const char *src, /* The UTF-8 string to dereference. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch = 0;
-
- while (index >= 0) {
- index--;
- src += TclUtfToUniChar(src, &ch);
+ Tcl_UniChar unichar = 0;
+ int bytes;
+ int ch = 0;
+
+ while (index-- >= 0) {
+ bytes = TclUtfToUniChar(src, &unichar);
+ ch = unichar;
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &unichar);
+ /* Combine surrogates */
+ ch = (((ch & 0x3ff) << 10) | (unichar & 0x3ff)) + 0x10000;
+ }
+ src += bytes;
}
return ch;
}
@@ -736,10 +757,15 @@ Tcl_UtfAtIndex(
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
+ int len;
while (index > 0) {
index--;
- src += TclUtfToUniChar(src, &ch);
+ len = TclUtfToUniChar(src, &ch);
+ if (!len) {
+ len = TclUtfToUniChar(src, &ch);
+ }
+ src += len;
}
return src;
}
@@ -819,7 +845,8 @@ int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch = 0, upChar;
+ Tcl_UniChar ch = 0;
+ int upChar;
char *src, *dst;
int bytes;
@@ -830,7 +857,14 @@ Tcl_UtfToUpper(
src = dst = str;
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- upChar = Tcl_UniCharToUpper(ch);
+ upChar = ch;
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+ upChar = Tcl_UniCharToUpper(upChar);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -872,7 +906,8 @@ int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch = 0, lowChar;
+ Tcl_UniChar ch = 0;
+ int lowChar;
char *src, *dst;
int bytes;
@@ -883,7 +918,14 @@ Tcl_UtfToLower(
src = dst = str;
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ lowChar = ch;
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+ lowChar = Tcl_UniCharToLower(lowChar);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -926,7 +968,8 @@ int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch = 0, titleChar, lowChar;
+ Tcl_UniChar ch = 0;
+ int titleChar, lowChar;
char *src, *dst;
int bytes;
@@ -939,7 +982,14 @@ Tcl_UtfToTitle(
if (*src) {
bytes = TclUtfToUniChar(src, &ch);
- titleChar = Tcl_UniCharToTitle(ch);
+ titleChar = ch;
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+ titleChar = Tcl_UniCharToTitle(titleChar);
if (bytes < TclUtfCount(titleChar)) {
memcpy(dst, src, (size_t) bytes);
@@ -951,7 +1001,14 @@ Tcl_UtfToTitle(
}
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ lowChar = ch;
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+ lowChar = Tcl_UniCharToLower(lowChar);
if (bytes < TclUtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
@@ -1159,16 +1216,18 @@ TclUtfCasecmp(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharToUpper(
int ch) /* Unicode character to convert. */
{
- int info = GetUniCharInfo(ch);
+ if (!UNICODE_OUT_OF_RANGE(ch)) {
+ int info = GetUniCharInfo(ch);
- if (GetCaseType(info) & 0x04) {
- ch -= GetDelta(info);
+ if (GetCaseType(info) & 0x04) {
+ ch -= GetDelta(info);
+ }
}
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
}
/*
@@ -1187,16 +1246,18 @@ Tcl_UniCharToUpper(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharToLower(
int ch) /* Unicode character to convert. */
{
- int info = GetUniCharInfo(ch);
+ if (!UNICODE_OUT_OF_RANGE(ch)) {
+ int info = GetUniCharInfo(ch);
- if (GetCaseType(info) & 0x02) {
- ch += GetDelta(info);
+ if (GetCaseType(info) & 0x02) {
+ ch += GetDelta(info);
+ }
}
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
}
/*
@@ -1215,23 +1276,25 @@ Tcl_UniCharToLower(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharToTitle(
int ch) /* Unicode character to convert. */
{
- int info = GetUniCharInfo(ch);
- int mode = GetCaseType(info);
+ if (!UNICODE_OUT_OF_RANGE(ch)) {
+ int info = GetUniCharInfo(ch);
+ int mode = GetCaseType(info);
- if (mode & 0x1) {
- /*
- * Subtract or add one depending on the original case.
- */
+ if (mode & 0x1) {
+ /*
+ * Subtract or add one depending on the original case.
+ */
- ch += ((mode & 0x4) ? -1 : 1);
- } else if (mode == 0x4) {
- ch -= GetDelta(info);
+ ch += ((mode & 0x4) ? -1 : 1);
+ } else if (mode == 0x4) {
+ ch -= GetDelta(info);
+ }
}
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
}
/*