From f61c3bdb284cedeb0db64a332f84bba54262565c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 1 May 2018 19:02:32 +0000 Subject: Start implementing TIP #497. regexp's now are >BMP-aware. WIP --- generic/regc_locale.c | 4 +- generic/regcustom.h | 12 ++--- generic/regex.h | 2 +- generic/tclInt.h | 9 ++++ generic/tclRegexp.c | 39 +++++++++------- generic/tclUtf.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 162 insertions(+), 29 deletions(-) diff --git a/generic/regc_locale.c b/generic/regc_locale.c index 002b264..19ac511 100644 --- a/generic/regc_locale.c +++ b/generic/regc_locale.c @@ -828,7 +828,7 @@ element( */ Tcl_DStringInit(&ds); - np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); + np = TclUnicodeToUtfDString(startp, (int)len, &ds); for (cn=cnames; cn->name!=NULL; cn++) { if (strlen(cn->name)==len && strncmp(cn->name, np, len)==0) { break; /* NOTE BREAK OUT */ @@ -1000,7 +1000,7 @@ cclass( len = endp - startp; Tcl_DStringInit(&ds); - np = Tcl_UniCharToUtfDString(startp, (int)len, &ds); + np = TclUnicodeToUtfDString(startp, (int)len, &ds); /* * Map the name to the corresponding enumerated value. diff --git a/generic/regcustom.h b/generic/regcustom.h index 095385d..5befada 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -66,7 +66,7 @@ #undef __REG_NOCHAR #endif /* Interface types */ -#define __REG_WIDE_T Tcl_UniChar +#define __REG_WIDE_T unsigned #define __REG_REGOFF_T long /* Not really right, but good enough... */ /* Names and declarations */ #define __REG_WIDE_COMPILE TclReComp @@ -81,22 +81,16 @@ * Internal character type and related. */ -typedef Tcl_UniChar chr; /* The type itself. */ +typedef unsigned chr; /* The type itself. */ typedef int pchr; /* What it promotes to. */ typedef unsigned uchr; /* Unsigned type that will hold a chr. 
*/ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 4 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ -#define CHR_MAX 0x10ffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ -#else -#define CHRBITS 16 /* Bits in a chr; must not use sizeof */ -#define CHR_MIN 0x0000 /* Smallest and largest chr; the value */ -#define CHR_MAX 0xffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ -#endif +#define CHR_MAX 0x0010ffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ /* * Functions operating on chr. diff --git a/generic/regex.h b/generic/regex.h index 8845f72..0b559f4 100644 --- a/generic/regex.h +++ b/generic/regex.h @@ -99,7 +99,7 @@ extern "C" { #undef __REG_NOCHAR #endif /* interface types */ -#define __REG_WIDE_T Tcl_UniChar +#define __REG_WIDE_T unsigned #define __REG_REGOFF_T long /* not really right, but good enough... */ /* names and declarations */ #define __REG_WIDE_COMPILE TclReComp diff --git a/generic/tclInt.h b/generic/tclInt.h index 50048e9..28549d9 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3985,6 +3985,15 @@ MODULE_SCOPE int TclPtrUnsetVarIdx(Tcl_Interp *interp, Var *varPtr, MODULE_SCOPE void TclInvalidateNsPath(Namespace *nsPtr); MODULE_SCOPE void TclFindArrayPtrElements(Var *arrayPtr, Tcl_HashTable *tablePtr); +#if TCL_UTF_MAX <= 4 +MODULE_SCOPE char * TclUnicodeToUtfDString(const unsigned *uniStr, + int uniLength, Tcl_DString *dsPtr); +MODULE_SCOPE unsigned * TclUtfToUnicodeDString(const char *src, int length, + Tcl_DString *dsPtr); +#else +# define TclUnicodeToUtfDString Tcl_UniCharToUtfDString +# define TclUtfToUnicodeDString Tcl_UtfToUniCharDString +#endif /* * The new extended interface to the variable traces. 
diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 5f8dc20..79b979c 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -90,8 +90,8 @@ static void DupRegexpInternalRep(Tcl_Obj *srcPtr, static void FinalizeRegexp(ClientData clientData); static void FreeRegexp(TclRegexp *regexpPtr); static void FreeRegexpInternalRep(Tcl_Obj *objPtr); -static int RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re, - const Tcl_UniChar *uniString, int numChars, +static int RegExpExecUnicode(Tcl_Interp *interp, Tcl_RegExp re, + const __REG_WIDE_T *uniString, int numChars, int nmatches, int flags); static int SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); @@ -175,7 +175,7 @@ Tcl_RegExpExec( int flags, result, numChars; TclRegexp *regexp = (TclRegexp *) re; Tcl_DString ds; - const Tcl_UniChar *ustr; + const __REG_WIDE_T *ustr; /* * If the starting point is offset from the beginning of the buffer, then @@ -200,9 +200,9 @@ Tcl_RegExpExec( */ Tcl_DStringInit(&ds); - ustr = Tcl_UtfToUniCharDString(text, -1, &ds); - numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar); - result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */, + ustr = TclUtfToUnicodeDString(text, -1, &ds); + numChars = Tcl_DStringLength(&ds) / sizeof(__REG_WIDE_T); + result = RegExpExecUnicode(interp, re, ustr, numChars, -1 /* nmatches */, flags); Tcl_DStringFree(&ds); @@ -261,7 +261,7 @@ Tcl_RegExpRange( /* *--------------------------------------------------------------------------- * - * RegExpExecUniChar -- + * RegExpExecUnicode -- * * Execute the regular expression matcher using a compiled form of a * regular expression and save information about any match that is found. @@ -279,12 +279,12 @@ Tcl_RegExpRange( */ static int -RegExpExecUniChar( +RegExpExecUnicode( Tcl_Interp *interp, /* Interpreter to use for error reporting. 
*/ Tcl_RegExp re, /* Compiled regular expression; returned by a * previous call to Tcl_GetRegExpFromObj */ - const Tcl_UniChar *wString, /* String against which to match re. */ - int numChars, /* Length of Tcl_UniChar string (must be + const __REG_WIDE_T *wString, /* String against which to match re. */ + int numChars, /* Length of Unicode string (must be * >=0). */ int nmatches, /* How many subexpression matches (counting * the whole match as subexpression 0) are of @@ -432,8 +432,9 @@ Tcl_RegExpExecObj( int flags) /* Regular expression execution flags. */ { TclRegexp *regexpPtr = (TclRegexp *) re; - Tcl_UniChar *udata; - int length; + Tcl_DString ds; + __REG_WIDE_T *udata; + int length, result; int reflags = regexpPtr->flags; #define TCL_REG_GLOBOK_FLAGS \ (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) @@ -464,7 +465,9 @@ Tcl_RegExpExecObj( regexpPtr->string = NULL; regexpPtr->objPtr = textObj; - udata = Tcl_GetUnicodeFromObj(textObj, &length); + Tcl_DStringInit(&ds); + udata = TclUtfToUnicodeDString(Tcl_GetString(textObj), -1, &ds); + length = Tcl_DStringLength(&ds)/sizeof(__REG_WIDE_T); if (offset > length) { offset = length; @@ -472,7 +475,9 @@ Tcl_RegExpExecObj( udata += offset; length -= offset; - return RegExpExecUniChar(interp, re, udata, length, nmatches, flags); + result = RegExpExecUnicode(interp, re, udata, length, nmatches, flags); + Tcl_DStringFree(&ds); + return result; } /* @@ -858,7 +863,7 @@ CompileRegexp( int flags) /* Compilation flags. 
*/ { TclRegexp *regexpPtr; - const Tcl_UniChar *uniString; + const __REG_WIDE_T *uniString; int numChars, status, i, exact; Tcl_DString stringBuf; ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); @@ -923,8 +928,8 @@ CompileRegexp( */ Tcl_DStringInit(&stringBuf); - uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf); - numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar); + uniString = TclUtfToUnicodeDString(string, length, &stringBuf); + numChars = Tcl_DStringLength(&stringBuf) / sizeof(__REG_WIDE_T); /* * Compile the string and check for errors. diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1d73a7a..259a124 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -235,6 +235,63 @@ Tcl_UniCharToUtfDString( /* *--------------------------------------------------------------------------- * + * TclUnicodeToUtfDString -- + * + * Convert the given Unicode string to UTF-8. + * + * Results: + * The return value is a pointer to the UTF-8 representation of the + * Unicode string. Storage for the return value is appended to the end of + * dsPtr. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +#if TCL_UTF_MAX <= 4 +char * +TclUnicodeToUtfDString( + const unsigned *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars + * (must be >= 0). */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + const unsigned *w, *wEnd; + char *p, *string; + int oldLength; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * 4. 
+ */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * 4); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + if ((*w & 0xFFFFF800) == 0xD800) { /* Only U+D800..U+DFFF: written directly as a 3-byte sequence. */ + *p++ = (*w >> 12) | 0xE0; + *p++ = ((*w >> 6) & 0x3F) | 0x80; + *p++ = (*w & 0x3F) | 0x80; /* Continuation bytes carry the 10xxxxxx marker. */ + } else { + p += Tcl_UniCharToUtf(*w, p); + } + w++; + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} +#endif + +/* + *--------------------------------------------------------------------------- + * * Tcl_UtfToUniChar -- * * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 @@ -439,6 +496,74 @@ Tcl_UtfToUniCharDString( /* *--------------------------------------------------------------------------- * + * TclUtfToUnicodeDString -- + * + * Convert the UTF-8 string to Unicode. + * + * Results: + * The return value is a pointer to the Unicode representation of the + * UTF-8 string. Storage for the return value is appended to the end of + * dsPtr. The Unicode string is terminated with a Unicode NULL character. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ +#if TCL_UTF_MAX <= 4 +unsigned * +TclUtfToUnicodeDString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + Tcl_UniChar ch = 0; + unsigned *w, *wString; + const char *p, *end; + int oldLength, len; + + if (length < 0) { + length = strlen(src); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. + */ + + oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! 
*/ + Tcl_DStringSetLength(dsPtr, + (int) ((oldLength + length + 1) * sizeof(unsigned))); + wString = (unsigned *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + end = src + length; + for (p = src; p < end; ) { + len = TclUtfToUniChar(p, &ch); + if (!len) { + int high = ch; + len = TclUtfToUniChar(p, &ch); + *w++ = ((high & 0x3ff) << 10) + (ch & 0x3ff) + 0x10000; /* Surrogate pair: 10 payload bits from each half. */ + } else { + *w++ = ch; + } + p += len; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + (oldLength + ((char *) w - (char *) wString))); + + return wString; +} +#endif + +/* + *--------------------------------------------------------------------------- + * * Tcl_UtfCharComplete -- * * Determine if the UTF-8 string of the given length is long enough to be -- cgit v0.12 From 80a7abf7e553cc0c0ea01f10df7790996460b133 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 27 Sep 2019 13:02:39 +0000 Subject: Adapt test-case to full-utf correct behaviour --- tests/string.test | 14 +++++++------- tests/utf.test | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/string.test b/tests/string.test index c54b5ba..299c765 100644 --- a/tests/string.test +++ b/tests/string.test @@ -31,7 +31,7 @@ proc makeShared {s} {uplevel 1 [list lappend copy $s]; return $s} testConstraint testobj [expr {[info commands testobj] ne {}}] testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] -testConstraint tip389 [expr {[string length \U010000] == 2}] +testConstraint fullutf [expr {[string length \U010000] == 1}] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -505,9 +505,9 @@ test string-5.19.$noComp {string index, bytearray object out of bounds} { test string-5.20.$noComp {string index, bytearray object out of bounds} { run {string index [binary format I* {0x50515253 0x52}] 20} } {} -test string-5.21.$noComp {string index, surrogates, bug 
[11ae2be95dac9417]} tip389 { +test string-5.21.$noComp {string index, surrogates, bug [11ae2be95dac9417]} fullutf { run {list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3]} -} [list \U100000 {} b] +} [list \U100000 b {}] proc largest_int {} { @@ -1502,9 +1502,9 @@ test string-12.22.$noComp {string range, shimmering binary/index} { binary scan $s a* x run {string range $s $s end} } 000000001 -test string-12.23.$noComp {string range, surrogates, bug [11ae2be95dac9417]} tip389 { +test string-12.23.$noComp {string range, surrogates, bug [11ae2be95dac9417]} fullutf { run {list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3]} -} [list \U100000 {} b] +} [list \U100000 b {}] test string-13.1.$noComp {string repeat} { list [catch {run {string repeat}} msg] $msg @@ -1743,10 +1743,10 @@ test string-17.7.$noComp {string totitle, unicode} { test string-17.8.$noComp {string totitle, compiled} { lindex [run {string totitle [list aa bb [list cc]]}] 0 } Aa -test string-17.9.$noComp {string totitle, surrogates, bug [11ae2be95dac9417]} tip389 { +test string-17.9.$noComp {string totitle, surrogates, bug [11ae2be95dac9417]} fullutf { run {list [string totitle a\U118c0c 1 1] [string totitle a\U118c0c 2 2] \ [string totitle a\U118c0c 3 3]} -} [list a\U118a0c a\U118c0C a\U118c0C] +} [list a\U118a0c a\U118c0C a\U118c0c] test string-18.1.$noComp {string trim} { list [catch {run {string trim}} msg] $msg diff --git a/tests/utf.test b/tests/utf.test index 979c4a6..45698e4 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -21,7 +21,7 @@ testConstraint testbytestring [llength [info commands testbytestring]] catch {unset x} # Some tests require support for 4-byte UTF-8 sequences -testConstraint tip389 [expr {[string length \U010000] == 2}] +testConstraint fullutf [expr {[string length \U010000] == 1}] test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { expr {"\x01" eq [testbytestring "\x01"]} @@ 
-78,12 +78,12 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { string length [testbytestring "\xE4\xb9\x8e"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {tip389 testbytestring} -body { +test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF0\x90\x80\x80"] -} -result {2} -test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {tip389 testbytestring} -body { +} -result {1} +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] -} -result {2} +} -result {1} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} -- cgit v0.12