From 0fce9600ee90c152b14295a28da571c3a4db622b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 25 May 2020 11:53:11 +0000 Subject: Finish implementation of "string nextchar|nextword|prevchar|prevword". Not thourougly test yet, but seems OK at first sight. --- generic/tclCmdMZ.c | 163 ++++++++++++++++++----------------------- generic/tclExecute.c | 2 +- generic/tclInt.h | 6 +- generic/tclUtf.c | 15 +++- tests/string.test | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 96 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index bbd03d8..36b2443 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2499,40 +2499,38 @@ StringStartCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int cur, index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index >= length) { + index = length - 1; } cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); + p = &string[index]; - TclUtfToUCS4(p, &ch); + TclUniCharToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { int delta = 0; - const char *next; + const Tcl_UniChar *next; if (!Tcl_UniCharIsWordChar(ch)) { break; } - next = Tcl_UtfPrev(p, string); + next = TclUCS4Prev(p, string); do { next += delta; - delta = TclUtfToUCS4(next, &ch); + delta = TclUniCharToUCS4(next, &ch); } while (next + delta < p); p = next; } @@ -2568,49 +2566,28 @@ StringPrevCharCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index > length) { + index = length; } - cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); - - TclUtfToUCS4(p, &ch); - for (cur = index; cur >= 0; cur--) { - int delta = 0; - const char *next; - - if (!Tcl_UniCharIsWordChar(ch)) { - break; - } - - next = Tcl_UtfPrev(p, string); - do { - next += delta; - delta = TclUtfToUCS4(next, &ch); - } while (next + delta < p); - p = next; - } - if (cur != index) { - cur += 1; - } + p = &string[index]; + index = TclUCS4Prev(p, string) - string; + } else { + index = 0; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; } @@ -2639,40 +2616,53 @@ StringPrevWordCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int cur, index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index >= length) { + index = length - 1; } cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); + p = &string[index]; - TclUtfToUCS4(p, &ch); + TclUniCharToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { int delta = 0; - const char *next; + const Tcl_UniChar *next; if (!Tcl_UniCharIsWordChar(ch)) { break; } - next = Tcl_UtfPrev(p, string); + next = TclUCS4Prev(p, string); do { next += delta; - delta = TclUtfToUCS4(next, &ch); + delta = TclUniCharToUCS4(next, &ch); + } while (next + delta < p); + p = next; + } + for (; cur >= 0; cur--) { + int delta = 0; + const Tcl_UniChar *next; + + if (Tcl_UniCharIsWordChar(ch)) { + break; + } + + next = TclUCS4Prev(p, string); + do { + next += delta; + delta = TclUniCharToUCS4(next, &ch); } while (next + delta < p); p = next; } @@ -2769,39 +2759,27 @@ StringNextCharCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *end, *string; - int cur, index, length, numChars; + const Tcl_UniChar *string; + int index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); if (index < 0) { index = 0; } - if (index < numChars) { - p = Tcl_UtfAtIndex(string, index); - end = string+length; - for (cur = index; p < end; cur++) { - p += TclUtfToUCS4(p, &ch); - if (!Tcl_UniCharIsWordChar(ch)) { - break; - } - } - if (cur == index) { - cur++; - } + if (index < length) { + index += TclUniCharToUCS4(&string[index], &ch); } else { - cur = numChars; + index = length; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; } @@ -2831,39 +2809,40 @@ StringNextWordCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *end, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *end, *string; + int index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); if (index < 0) { index = 0; } - if (index < numChars) { - p = Tcl_UtfAtIndex(string, index); + if (index < length) { + p = &string[index]; end = string+length; - for (cur = index; p < end; cur++) { - p += TclUtfToUCS4(p, &ch); + while (index++, p < end) { + p += TclUniCharToUCS4(p, &ch); if (!Tcl_UniCharIsWordChar(ch)) { break; } } - if (cur == index) { - cur++; + for (; p < end; index++) { + p += TclUniCharToUCS4(p, &ch); + if (Tcl_UniCharIsWordChar(ch)) { + break; + } } } else { - cur = numChars; + index = length; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; } diff --git a/generic/tclExecute.c b/generic/tclExecute.c index cc366e7..c80e12b 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5292,7 +5292,7 @@ TEBCresume( } else if (TclIsPureByteArray(valuePtr)) { objResultPtr = Tcl_NewByteArrayObj( Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1); - } else if (valuePtr->bytes && length == valuePtr->length) { + } else if (valuePtr->bytes && length == valuePtr->length && !(valuePtr->bytes[index] & 0x80)) { objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { diff --git a/generic/tclInt.h b/generic/tclInt.h index ef7411a..e1dedda 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3253,9 +3253,11 @@ MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar # define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1) +# define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src)) #else - MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); - MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *src, int *ucs4Ptr); + MODULE_SCOPE int TclUtfToUCS4(const char *, int *); + MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *, int *); + MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *); #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index db2fc02..807e087 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2642,12 +2642,25 @@ TclUniCharToUCS4( * by the Tcl_UniChar string. */ { if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) { - *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000; + *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000; return 2; } *ucs4Ptr = src[0]; return 1; } + +const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) { + if (src <= ptr + 1) { + return ptr; + } + if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) { + return src - 2; + } + return src - 1; +} + + + #endif /* diff --git a/tests/string.test b/tests/string.test index 184a555..17a6d3c 100644 --- a/tests/string.test +++ b/tests/string.test @@ -2534,6 +2534,206 @@ test string-32.17.$noComp {string is dict, valid dict packed in invalid dict} { string is dict {{a b c d e f g h}} } 0 +test string-33.1.$noComp {string nextchar} -body { + list [catch {run {string nextchar a}} msg] $msg +} -result {1 {wrong # args: should be "string nextchar string index"}} +test string-33.2.$noComp {string nextchar} -body { + list [catch {run {string nextchar a b c}} msg] $msg +} -result {1 {wrong # args: should be "string nextchar string index"}} +test string-33.3.$noComp {string nextchar} -body { + list [catch {run {string nextchar a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-33.4.$noComp {string nextchar} -body { + run {string nextchar abc. -1} +} -result 1 +test string-33.5.$noComp {string nextchar} -body { + run {string nextchar abc. 100} +} -result 4 +test string-33.6.$noComp {string nextchar} -body { + run {string nextchar "word_one two three" 2} +} -result 3 +test string-33.7.$noComp {string nextchar} -body { + run {string nextchar "one .&# three" 5} +} -result 6 +test string-33.8.$noComp {string nextchar} -body { + run {string worde "x.y" 0} +} -result 1 +test string-33.9.$noComp {string nextchar} -body { + run {string worde "x.y" end-1} +} -result 2 +test string-33.10.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\xC7de fg" 0} +} -result 1 +test string-33.11.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\uC700de fg" 0} +} -result 1 +test string-33.12.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\u203Fde fg" 0} +} -result 1 +test string-33.13.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\u2045de fg" 0} +} -result 1 +test string-33.14.$noComp {string nextchar, unicode} -body { + run {string nextchar "\uC700\uC700 abc" 8} +} -result 6 +test string-33.15.$noComp {string nextchar, unicode} -constraints utf16 -body { + run {string nextchar "\U1D7CA\U1D7CA abc" 0} +} -result 2 +test string-33.16.$noComp {string nextchar, unicode} -constraints utf16 -body { + run {string nextchar "\U1D7CA\U1D7CA abc" 10} +} -result 8 + +test string-34.1.$noComp {string nextword} -body { + list [catch {run {string nextword a}} msg] $msg +} -result {1 {wrong # args: should be "string nextword string index"}} +test string-34.2.$noComp {string nextword} -body { + list [catch {run {string nextword a b c}} msg] $msg +} -result {1 {wrong # args: should be "string nextword string index"}} +test string-34.3.$noComp {string nextword} -body { + list [catch {run {string nextword a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-34.4.$noComp {string nextword} -body { + run {string nextword abc. -1} +} -result 4 +test string-34.5.$noComp {string nextword} -body { + run {string nextword abc. 100} +} -result 4 +test string-34.6.$noComp {string nextword} -body { + run {string nextword "word_one two three" 2} +} -result 9 +test string-34.7.$noComp {string nextword} -body { + run {string nextword "one .&# three" 5} +} -result 8 +test string-34.8.$noComp {string nextword} -body { + run {string worde "x.y" 0} +} -result 1 +test string-34.9.$noComp {string nextword} -body { + run {string worde "x.y" end-1} +} -result 2 +test string-34.10.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\xC7de fg" 0} +} -result 7 +test string-34.11.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\uC700de fg" 0} +} -result 7 +test string-34.12.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\u203Fde fg" 0} +} -result 7 +test string-34.13.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\u2045\u2045de fg" 0} +} -result 5 +test string-34.14.$noComp {string nextword, unicode} -body { + run {string nextword "\uC700\uC700 abc" 8} +} -result 6 +test string-34.15.$noComp {string nextword, unicode} -body { + run {string nextword "\U1D7CA\U1D7CA abc" 0} +} -result 3 +test string-34.16.$noComp {string nextword, unicode} -constraints utf16 -body { + run {string nextword "\U1D7CA\U1D7CA abc" 10} +} -result 8 + +test string-35.1.$noComp {string prevchar} -body { + list [catch {run {string word a}} msg] $msg +} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +test string-35.2.$noComp {string prevchar} -body { + list [catch {run {string prevchar a}} msg] $msg +} -result {1 {wrong # args: should be "string prevchar string index"}} +test string-35.3.$noComp {string prevchar} -body { + list [catch {run {string prevchar a b c}} msg] $msg +} -result {1 {wrong # args: should be "string prevchar string index"}} +test string-35.4.$noComp {string prevchar} -body { + list [catch {run {string prevchar a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-35.5.$noComp {string prevchar} -body { + run {string prevchar "one two three_words" 400} +} -result 18 +test string-35.6.$noComp {string prevchar} -body { + run {string prevchar "one two three_words" 2} +} -result 1 +test string-35.7.$noComp {string prevchar} -body { + run {string prevchar "one two three_words" -2} +} -result 0 +test string-35.8.$noComp {string prevchar} -body { + run {string prevchar "one .*&^ three" 6} +} -result 5 +test string-35.9.$noComp {string prevchar} -body { + run {string prevchar "one two three" 4} +} -result 3 +test string-35.10.$noComp {string prevchar} -body { + run {string prevchar "one two three" end-5} +} -result 6 +test string-35.11.$noComp {string prevchar, unicode} -body { + run {string prevchar "one tw\xC7o three" 7} +} -result 6 +test string-35.12.$noComp {string prevchar, unicode} -body { + run {string prevchar "ab\uC700\uC700 cdef ghi" 12} +} -result 11 +test string-35.13.$noComp {string prevchar, unicode} -body { + run {string prevchar "\uC700\uC700 abc" 8} +} -result 5 +test string-35.14.$noComp {string prevchar, invalid UTF-8} -constraints testbytestring -body { + # See Bug c61818e4c9 + set demo [testbytestring "abc def\xE0\xA9ghi"] + run {string index $demo [string prevchar $demo 10]} +} -result g +test string-35.15.$noComp {string prevchar, unicode} -body { + run {string prevchar "\U1D7CA\U1D7CA abc" 0} +} -result 0 +test string-35.16.$noComp {string prevchar, unicode} -constraints utf16 -body { + run {string prevchar "\U1D7CA\U1D7CA abc" 10} +} -result 7 + +test string-36.1.$noComp {string prevword} -body { + list [catch {run {string word a}} msg] $msg +} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +test string-36.2.$noComp {string prevword} -body { + list [catch {run {string prevword a}} msg] $msg +} -result {1 {wrong # args: should be "string prevword string index"}} +test string-36.3.$noComp {string prevword} -body { + list [catch {run {string prevword a b c}} msg] $msg +} -result {1 {wrong # args: should be "string prevword string index"}} +test string-36.4.$noComp {string prevword} -body { + list [catch {run {string prevword a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-36.5.$noComp {string prevword} -body { + run {string prevword "one two three_words" 400} +} -result 7 +test string-36.6.$noComp {string prevword} -body { + run {string prevword "one two three_words" 2} +} -result 0 +test string-36.7.$noComp {string prevword} -body { + run {string prevword "one two three_words" -2} +} -result 0 +test string-36.8.$noComp {string prevword} -body { + run {string prevword "one .*&^ three" 6} +} -result 3 +test string-36.9.$noComp {string prevword} -body { + run {string prevword "one two three" 4} +} -result 3 +test string-36.10.$noComp {string prevword} -body { + run {string prevword "one two three" end-5} +} -result 7 +test string-36.11.$noComp {string prevword, unicode} -body { + run {string prevword "one tw\xC7o three" 7} +} -result 3 +test string-36.12.$noComp {string prevword, unicode} -body { + run {string prevword "ab\uC700\uC700 cdef ghi" 12} +} -result 9 +test string-36.13.$noComp {string prevword, unicode} -body { + run {string prevword "\uC700\uC700 abc" 8} +} -result 2 +test string-36.14.$noComp {string prevword, invalid UTF-8} -constraints testbytestring -body { + # See Bug c61818e4c9 + set demo [testbytestring "abc def\xE0\xA9ghi"] + run {string index $demo [string prevword $demo 10]} +} -result \xA9 +test string-36.15.$noComp {string prevword, unicode} -body { + run {string prevword "\U1D7CA\U1D7CA abc" 0} +} -result 0 +test string-36.16.$noComp {string prevword, unicode} -constraints utf16 -body { + run {string prevword "\U1D7CA\U1D7CA abc" 10} +} -result 4 + }; # foreach noComp {0 1} # cleanup -- cgit v0.12