summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclCmdMZ.c163
-rw-r--r--generic/tclExecute.c2
-rw-r--r--generic/tclInt.h6
-rw-r--r--generic/tclUtf.c15
-rw-r--r--tests/string.test200
5 files changed, 290 insertions, 96 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index bbd03d8..36b2443 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -2499,40 +2499,38 @@ StringStartCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
int ch;
- const char *p, *string;
- int cur, index, length, numChars;
+ const Tcl_UniChar *p, *string;
+ int cur, index, length;
if (objc != 3) {
Tcl_WrongNumArgs(interp, 1, objv, "string index");
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- numChars = Tcl_NumUtfChars(string, length);
- if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+ string = Tcl_GetUnicodeFromObj(objv[1], &length);
+ if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- if (index >= numChars) {
- index = numChars - 1;
+ if (index >= length) {
+ index = length - 1;
}
cur = 0;
if (index > 0) {
- p = Tcl_UtfAtIndex(string, index);
+ p = &string[index];
- TclUtfToUCS4(p, &ch);
+ TclUniCharToUCS4(p, &ch);
for (cur = index; cur >= 0; cur--) {
int delta = 0;
- const char *next;
+ const Tcl_UniChar *next;
if (!Tcl_UniCharIsWordChar(ch)) {
break;
}
- next = Tcl_UtfPrev(p, string);
+ next = TclUCS4Prev(p, string);
do {
next += delta;
- delta = TclUtfToUCS4(next, &ch);
+ delta = TclUniCharToUCS4(next, &ch);
} while (next + delta < p);
p = next;
}
@@ -2568,49 +2566,28 @@ StringPrevCharCmd(
int objc, /* Number of arguments. */
Tcl_Obj *const objv[]) /* Argument objects. */
{
- int ch;
- const char *p, *string;
- int cur, index, length, numChars;
+ const Tcl_UniChar *p, *string;
+ int index, length;
if (objc != 3) {
Tcl_WrongNumArgs(interp, 1, objv, "string index");
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- numChars = Tcl_NumUtfChars(string, length);
- if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+ string = Tcl_GetUnicodeFromObj(objv[1], &length);
+ if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- if (index >= numChars) {
- index = numChars - 1;
+ if (index > length) {
+ index = length;
}
- cur = 0;
if (index > 0) {
- p = Tcl_UtfAtIndex(string, index);
-
- TclUtfToUCS4(p, &ch);
- for (cur = index; cur >= 0; cur--) {
- int delta = 0;
- const char *next;
-
- if (!Tcl_UniCharIsWordChar(ch)) {
- break;
- }
-
- next = Tcl_UtfPrev(p, string);
- do {
- next += delta;
- delta = TclUtfToUCS4(next, &ch);
- } while (next + delta < p);
- p = next;
- }
- if (cur != index) {
- cur += 1;
- }
+ p = &string[index];
+ index = TclUCS4Prev(p, string) - string;
+ } else {
+ index = 0;
}
- Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
+ Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
return TCL_OK;
}
@@ -2639,40 +2616,53 @@ StringPrevWordCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
int ch;
- const char *p, *string;
- int cur, index, length, numChars;
+ const Tcl_UniChar *p, *string;
+ int cur, index, length;
if (objc != 3) {
Tcl_WrongNumArgs(interp, 1, objv, "string index");
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- numChars = Tcl_NumUtfChars(string, length);
- if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+ string = Tcl_GetUnicodeFromObj(objv[1], &length);
+ if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- if (index >= numChars) {
- index = numChars - 1;
+ if (index >= length) {
+ index = length - 1;
}
cur = 0;
if (index > 0) {
- p = Tcl_UtfAtIndex(string, index);
+ p = &string[index];
- TclUtfToUCS4(p, &ch);
+ TclUniCharToUCS4(p, &ch);
for (cur = index; cur >= 0; cur--) {
int delta = 0;
- const char *next;
+ const Tcl_UniChar *next;
if (!Tcl_UniCharIsWordChar(ch)) {
break;
}
- next = Tcl_UtfPrev(p, string);
+ next = TclUCS4Prev(p, string);
do {
next += delta;
- delta = TclUtfToUCS4(next, &ch);
+ delta = TclUniCharToUCS4(next, &ch);
+ } while (next + delta < p);
+ p = next;
+ }
+ for (; cur >= 0; cur--) {
+ int delta = 0;
+ const Tcl_UniChar *next;
+
+ if (Tcl_UniCharIsWordChar(ch)) {
+ break;
+ }
+
+ next = TclUCS4Prev(p, string);
+ do {
+ next += delta;
+ delta = TclUniCharToUCS4(next, &ch);
} while (next + delta < p);
p = next;
}
@@ -2769,39 +2759,27 @@ StringNextCharCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
int ch;
- const char *p, *end, *string;
- int cur, index, length, numChars;
+ const Tcl_UniChar *string;
+ int index, length;
if (objc != 3) {
Tcl_WrongNumArgs(interp, 1, objv, "string index");
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- numChars = Tcl_NumUtfChars(string, length);
- if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+ string = Tcl_GetUnicodeFromObj(objv[1], &length);
+ if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
if (index < 0) {
index = 0;
}
- if (index < numChars) {
- p = Tcl_UtfAtIndex(string, index);
- end = string+length;
- for (cur = index; p < end; cur++) {
- p += TclUtfToUCS4(p, &ch);
- if (!Tcl_UniCharIsWordChar(ch)) {
- break;
- }
- }
- if (cur == index) {
- cur++;
- }
+ if (index < length) {
+ index += TclUniCharToUCS4(&string[index], &ch);
} else {
- cur = numChars;
+ index = length;
}
- Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
+ Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
return TCL_OK;
}
@@ -2831,39 +2809,40 @@ StringNextWordCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
int ch;
- const char *p, *end, *string;
- int cur, index, length, numChars;
+ const Tcl_UniChar *p, *end, *string;
+ int index, length;
if (objc != 3) {
Tcl_WrongNumArgs(interp, 1, objv, "string index");
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
- numChars = Tcl_NumUtfChars(string, length);
- if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) {
+ string = Tcl_GetUnicodeFromObj(objv[1], &length);
+ if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) {
return TCL_ERROR;
}
- string = TclGetStringFromObj(objv[1], &length);
if (index < 0) {
index = 0;
}
- if (index < numChars) {
- p = Tcl_UtfAtIndex(string, index);
+ if (index < length) {
+ p = &string[index];
end = string+length;
- for (cur = index; p < end; cur++) {
- p += TclUtfToUCS4(p, &ch);
+ while (index++, p < end) {
+ p += TclUniCharToUCS4(p, &ch);
if (!Tcl_UniCharIsWordChar(ch)) {
break;
}
}
- if (cur == index) {
- cur++;
+ for (; p < end; index++) {
+ p += TclUniCharToUCS4(p, &ch);
+ if (Tcl_UniCharIsWordChar(ch)) {
+ break;
+ }
}
} else {
- cur = numChars;
+ index = length;
}
- Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur));
+ Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index));
return TCL_OK;
}
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index cc366e7..c80e12b 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5292,7 +5292,7 @@ TEBCresume(
} else if (TclIsPureByteArray(valuePtr)) {
objResultPtr = Tcl_NewByteArrayObj(
Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1);
- } else if (valuePtr->bytes && length == valuePtr->length) {
+ } else if (valuePtr->bytes && length == valuePtr->length && !(valuePtr->bytes[index] & 0x80)) {
objResultPtr = Tcl_NewStringObj((const char *)
valuePtr->bytes+index, 1);
} else {
diff --git a/generic/tclInt.h b/generic/tclInt.h
index ef7411a..e1dedda 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3253,9 +3253,11 @@ MODULE_SCOPE int TclUtfCount(int ch);
#if TCL_UTF_MAX > 3
# define TclUtfToUCS4 Tcl_UtfToUniChar
# define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1)
+# define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src))
#else
- MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
- MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *src, int *ucs4Ptr);
+ MODULE_SCOPE int TclUtfToUCS4(const char *, int *);
+ MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *, int *);
+ MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *);
#endif
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index db2fc02..807e087 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2642,12 +2642,25 @@ TclUniCharToUCS4(
* by the Tcl_UniChar string. */
{
if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) {
- *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000;
+ *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
return 2;
}
*ucs4Ptr = src[0];
return 1;
}
+
+const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) {
+ if (src <= ptr + 1) {
+ return ptr;
+ }
+ if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) {
+ return src - 2;
+ }
+ return src - 1;
+}
+
+
+
#endif
/*
diff --git a/tests/string.test b/tests/string.test
index 184a555..17a6d3c 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -2534,6 +2534,206 @@ test string-32.17.$noComp {string is dict, valid dict packed in invalid dict} {
string is dict {{a b c d e f g h}}
} 0
+test string-33.1.$noComp {string nextchar} -body {
+ list [catch {run {string nextchar a}} msg] $msg
+} -result {1 {wrong # args: should be "string nextchar string index"}}
+test string-33.2.$noComp {string nextchar} -body {
+ list [catch {run {string nextchar a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string nextchar string index"}}
+test string-33.3.$noComp {string nextchar} -body {
+ list [catch {run {string nextchar a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-33.4.$noComp {string nextchar} -body {
+ run {string nextchar abc. -1}
+} -result 1
+test string-33.5.$noComp {string nextchar} -body {
+ run {string nextchar abc. 100}
+} -result 4
+test string-33.6.$noComp {string nextchar} -body {
+ run {string nextchar "word_one two three" 2}
+} -result 3
+test string-33.7.$noComp {string nextchar} -body {
+ run {string nextchar "one .&# three" 5}
+} -result 6
+test string-33.8.$noComp {string nextchar} -body {
+ run {string worde "x.y" 0}
+} -result 1
+test string-33.9.$noComp {string nextchar} -body {
+ run {string worde "x.y" end-1}
+} -result 2
+test string-33.10.$noComp {string nextchar, unicode} -body {
+ run {string nextchar "xyz\xC7de fg" 0}
+} -result 1
+test string-33.11.$noComp {string nextchar, unicode} -body {
+ run {string nextchar "xyz\uC700de fg" 0}
+} -result 1
+test string-33.12.$noComp {string nextchar, unicode} -body {
+ run {string nextchar "xyz\u203Fde fg" 0}
+} -result 1
+test string-33.13.$noComp {string nextchar, unicode} -body {
+ run {string nextchar "xyz\u2045de fg" 0}
+} -result 1
+test string-33.14.$noComp {string nextchar, unicode} -body {
+ run {string nextchar "\uC700\uC700 abc" 8}
+} -result 6
+test string-33.15.$noComp {string nextchar, unicode} -constraints utf16 -body {
+ run {string nextchar "\U1D7CA\U1D7CA abc" 0}
+} -result 2
+test string-33.16.$noComp {string nextchar, unicode} -constraints utf16 -body {
+ run {string nextchar "\U1D7CA\U1D7CA abc" 10}
+} -result 8
+
+test string-34.1.$noComp {string nextword} -body {
+ list [catch {run {string nextword a}} msg] $msg
+} -result {1 {wrong # args: should be "string nextword string index"}}
+test string-34.2.$noComp {string nextword} -body {
+ list [catch {run {string nextword a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string nextword string index"}}
+test string-34.3.$noComp {string nextword} -body {
+ list [catch {run {string nextword a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-34.4.$noComp {string nextword} -body {
+ run {string nextword abc. -1}
+} -result 4
+test string-34.5.$noComp {string nextword} -body {
+ run {string nextword abc. 100}
+} -result 4
+test string-34.6.$noComp {string nextword} -body {
+ run {string nextword "word_one two three" 2}
+} -result 9
+test string-34.7.$noComp {string nextword} -body {
+ run {string nextword "one .&# three" 5}
+} -result 8
+test string-34.8.$noComp {string nextword} -body {
+ run {string worde "x.y" 0}
+} -result 1
+test string-34.9.$noComp {string nextword} -body {
+ run {string worde "x.y" end-1}
+} -result 2
+test string-34.10.$noComp {string nextword, unicode} -body {
+ run {string nextword "xyz\xC7de fg" 0}
+} -result 7
+test string-34.11.$noComp {string nextword, unicode} -body {
+ run {string nextword "xyz\uC700de fg" 0}
+} -result 7
+test string-34.12.$noComp {string nextword, unicode} -body {
+ run {string nextword "xyz\u203Fde fg" 0}
+} -result 7
+test string-34.13.$noComp {string nextword, unicode} -body {
+ run {string nextword "xyz\u2045\u2045de fg" 0}
+} -result 5
+test string-34.14.$noComp {string nextword, unicode} -body {
+ run {string nextword "\uC700\uC700 abc" 8}
+} -result 6
+test string-34.15.$noComp {string nextword, unicode} -body {
+ run {string nextword "\U1D7CA\U1D7CA abc" 0}
+} -result 3
+test string-34.16.$noComp {string nextword, unicode} -constraints utf16 -body {
+ run {string nextword "\U1D7CA\U1D7CA abc" 10}
+} -result 8
+
+test string-35.1.$noComp {string prevchar} -body {
+ list [catch {run {string word a}} msg] $msg
+} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+test string-35.2.$noComp {string prevchar} -body {
+ list [catch {run {string prevchar a}} msg] $msg
+} -result {1 {wrong # args: should be "string prevchar string index"}}
+test string-35.3.$noComp {string prevchar} -body {
+ list [catch {run {string prevchar a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string prevchar string index"}}
+test string-35.4.$noComp {string prevchar} -body {
+ list [catch {run {string prevchar a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-35.5.$noComp {string prevchar} -body {
+ run {string prevchar "one two three_words" 400}
+} -result 18
+test string-35.6.$noComp {string prevchar} -body {
+ run {string prevchar "one two three_words" 2}
+} -result 1
+test string-35.7.$noComp {string prevchar} -body {
+ run {string prevchar "one two three_words" -2}
+} -result 0
+test string-35.8.$noComp {string prevchar} -body {
+ run {string prevchar "one .*&^ three" 6}
+} -result 5
+test string-35.9.$noComp {string prevchar} -body {
+ run {string prevchar "one two three" 4}
+} -result 3
+test string-35.10.$noComp {string prevchar} -body {
+ run {string prevchar "one two three" end-5}
+} -result 6
+test string-35.11.$noComp {string prevchar, unicode} -body {
+ run {string prevchar "one tw\xC7o three" 7}
+} -result 6
+test string-35.12.$noComp {string prevchar, unicode} -body {
+ run {string prevchar "ab\uC700\uC700 cdef ghi" 12}
+} -result 11
+test string-35.13.$noComp {string prevchar, unicode} -body {
+ run {string prevchar "\uC700\uC700 abc" 8}
+} -result 5
+test string-35.14.$noComp {string prevchar, invalid UTF-8} -constraints testbytestring -body {
+ # See Bug c61818e4c9
+ set demo [testbytestring "abc def\xE0\xA9ghi"]
+ run {string index $demo [string prevchar $demo 10]}
+} -result g
+test string-35.15.$noComp {string prevchar, unicode} -body {
+ run {string prevchar "\U1D7CA\U1D7CA abc" 0}
+} -result 0
+test string-35.16.$noComp {string prevchar, unicode} -constraints utf16 -body {
+ run {string prevchar "\U1D7CA\U1D7CA abc" 10}
+} -result 7
+
+test string-36.1.$noComp {string prevword} -body {
+ list [catch {run {string word a}} msg] $msg
+} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+test string-36.2.$noComp {string prevword} -body {
+ list [catch {run {string prevword a}} msg] $msg
+} -result {1 {wrong # args: should be "string prevword string index"}}
+test string-36.3.$noComp {string prevword} -body {
+ list [catch {run {string prevword a b c}} msg] $msg
+} -result {1 {wrong # args: should be "string prevword string index"}}
+test string-36.4.$noComp {string prevword} -body {
+ list [catch {run {string prevword a gorp}} msg] $msg
+} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}}
+test string-36.5.$noComp {string prevword} -body {
+ run {string prevword "one two three_words" 400}
+} -result 7
+test string-36.6.$noComp {string prevword} -body {
+ run {string prevword "one two three_words" 2}
+} -result 0
+test string-36.7.$noComp {string prevword} -body {
+ run {string prevword "one two three_words" -2}
+} -result 0
+test string-36.8.$noComp {string prevword} -body {
+ run {string prevword "one .*&^ three" 6}
+} -result 3
+test string-36.9.$noComp {string prevword} -body {
+ run {string prevword "one two three" 4}
+} -result 3
+test string-36.10.$noComp {string prevword} -body {
+ run {string prevword "one two three" end-5}
+} -result 7
+test string-36.11.$noComp {string prevword, unicode} -body {
+ run {string prevword "one tw\xC7o three" 7}
+} -result 3
+test string-36.12.$noComp {string prevword, unicode} -body {
+ run {string prevword "ab\uC700\uC700 cdef ghi" 12}
+} -result 9
+test string-36.13.$noComp {string prevword, unicode} -body {
+ run {string prevword "\uC700\uC700 abc" 8}
+} -result 2
+test string-36.14.$noComp {string prevword, invalid UTF-8} -constraints testbytestring -body {
+ # See Bug c61818e4c9
+ set demo [testbytestring "abc def\xE0\xA9ghi"]
+ run {string index $demo [string prevword $demo 10]}
+} -result \xA9
+test string-36.15.$noComp {string prevword, unicode} -body {
+ run {string prevword "\U1D7CA\U1D7CA abc" 0}
+} -result 0
+test string-36.16.$noComp {string prevword, unicode} -constraints utf16 -body {
+ run {string prevword "\U1D7CA\U1D7CA abc" 10}
+} -result 4
+
}; # foreach noComp {0 1}
# cleanup