From 4f6ea8aef90242e6052cf743e6684e312c84873c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 2 Dec 2019 20:26:58 +0000 Subject: If TCL_UTF_MAX>=4, make Tcl_ParseBackslash combine two surrogates so they appear as one 4-byte UTF-8 byte sequence from the start. Add test-case for this. --- generic/tclParse.c | 10 ++++++ generic/tclUtf.c | 2 +- generic/tclUtil.c | 2 +- tests/utf.test | 103 +++++++++++++++++++++++++++-------------------------- 4 files changed, 65 insertions(+), 52 deletions(-) diff --git a/generic/tclParse.c b/generic/tclParse.c index 1532c05..4f30f8b 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -920,6 +920,16 @@ TclParseBackslash( * No hexadigits -> This is just "u". */ result = 'u'; +#if TCL_UTF_MAX > 3 + } else if (((result & 0xDC00) == 0xD800) && (count == 6) && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) { + /* If high surrogate is immediately followed by a low surrogate escape, combine them. */ + int low; + int count2 = TclParseHex(p+7, 4, &low); + if ((low & 0xDC00) == 0xDC00) { + result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000; + count += count2 + 2; + } +#endif } break; case 'U': diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9c2ef03..ce80bd0 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -846,7 +846,7 @@ Tcl_UtfBackslash( * We ate a whole line. Pay the price of a strlen() */ - result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); + result = TclParseBackslash(src, strlen(src), &numRead, dst); } if (readPtr != NULL) { *readPtr = numRead; diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 61c1973..41b3481 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1649,7 +1649,7 @@ Tcl_Backslash( int *readPtr) /* Fill in with number of characters read from * src, unless NULL. */ { - char buf[TCL_UTF_MAX] = ""; + char buf[4] = ""; Tcl_UniChar ch = 0; Tcl_UtfBackslash(src, readPtr, buf); diff --git a/tests/utf.test b/tests/utf.test index c43d95a..1e257d3 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -21,40 +21,43 @@ testConstraint testbytestring [llength [info commands testbytestring]] catch {unset x} # Some tests require support for 4-byte UTF-8 sequences -testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}] +testConstraint fullutf [expr {[format %c 0x010000] != "\uFFFD"}] test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { expr {"\x01" eq [testbytestring "\x01"]} } 1 test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { - expr {"\x00" eq [testbytestring "\xc0\x80"]} + expr {"\x00" eq [testbytestring "\xC0\x80"]} } 1 test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { - expr {"\xe0" eq [testbytestring "\xc3\xa0"]} + expr {"\xE0" eq [testbytestring "\xC3\xA0"]} } 1 test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} testbytestring { - expr {"\u4e4e" eq [testbytestring "\xe4\xb9\x8e"]} + expr {"\u4E4E" eq [testbytestring "\xE4\xB9\x8E"]} } 1 test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { - expr {[format %c 0x110000] eq [testbytestring "\xef\xbf\xbd"]} + expr {[format %c 0x110000] eq [testbytestring "\xEF\xBF\xBD"]} } 1 test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { - expr {[format %c -1] eq [testbytestring "\xef\xbf\xbd"]} + expr {[format %c -1] eq [testbytestring "\xEF\xBF\xBD"]} } 1 test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints {fullutf testbytestring} -body { - expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]} + expr {"\U014E4E" eq [testbytestring "\xF0\x94\xB9\x8E"]} } -result 1 test utf-1.8 {Tcl_UniCharToUtf: 3 byte sequence, high surrogate} testbytestring { - expr {"\ud842" eq [testbytestring "\xed\xa1\x82"]} + expr {"\uD842" eq [testbytestring "\xED\xA1\x82"]} } 1 test utf-1.9 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring { - expr {"\udc42" eq [testbytestring "\xed\xb1\x82"]} + expr {"\uDC42" eq [testbytestring "\xED\xB1\x82"]} } 1 test utf-1.10 {Tcl_UniCharToUtf: 3 byte sequence, high surrogate} testbytestring { - expr {[format %c 0xd842] eq [testbytestring "\xed\xa1\x82"]} + expr {[format %c 0xD842] eq [testbytestring "\xED\xA1\x82"]} } 1 test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring { - expr {[format %c 0xdc42] eq [testbytestring "\xed\xb1\x82"]} + expr {[format %c 0xDC42] eq [testbytestring "\xED\xB1\x82"]} +} 1 +test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {fullutf testbytestring} { + expr {"\uD842\uDC42" eq [testbytestring "\xF0\xA0\xA1\x82"]} } 1 test utf-2.1 {Tcl_UtfToUniChar: low ascii} { @@ -67,7 +70,7 @@ test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestr string length [testbytestring "\xC2"] } {1} test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { - string length [testbytestring "\xC2\xa2"] + string length [testbytestring "\xC2\xA2"] } {1} test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { string length [testbytestring "\xE2"] @@ -76,7 +79,7 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin string length [testbytestring "\xE2\xA2"] } {2} test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { - string length [testbytestring "\xE4\xb9\x8e"] + string length [testbytestring "\xE4\xB9\x8E"] } {1} test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body { string length [testbytestring "\xF0\x90\x80\x80"] @@ -150,21 +153,21 @@ test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 } {a} test utf-8.2 {Tcl_UniCharAtIndex: index = 0} { - string index \u4e4e\u25a 0 -} "\u4e4e" + string index \u4E4E\u25A 0 +} "\u4E4E" test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { string index abcd 2 } {c} test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { - string index \u4e4e\u25a\xff\u543 2 + string index \u4E4E\u25A\xFF\u543 2 } "\uff" test utf-9.1 {Tcl_UtfAtIndex: index = 0} { string range abcd 0 2 } {abc} test utf-9.2 {Tcl_UtfAtIndex: index > 0} { - string range \u4e4e\u25a\xff\u543klmnop 1 5 -} "\u25a\xff\u543kl" + string range \u4E4E\u25A\xFF\u543klmnop 1 5 +} "\u25A\xFF\u543kl" test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { @@ -172,16 +175,16 @@ test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { } { } test utf-10.2 {Tcl_UtfBackslash: \u subst} testbytestring { - expr {"\ua2" eq [testbytestring "\xc2\xa2"]} + expr {"\uA2" eq [testbytestring "\xC2\xA2"]} } 1 test utf-10.3 {Tcl_UtfBackslash: longer \u subst} testbytestring { - expr {"\u4e21" eq [testbytestring "\xe4\xb8\xa1"]} + expr {"\u4E21" eq [testbytestring "\xE4\xB8\xA1"]} } 1 test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { - expr {"\u4e2k" eq "[testbytestring \xd3\xa2]k"} + expr {"\u4E2k" eq "[testbytestring \xD3\xA2]k"} } 1 test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { - expr {"\u4e216" eq "[testbytestring \xe4\xb8\xa1]6"} + expr {"\u4E216" eq "[testbytestring \xE4\xB8\xA1]6"} } 1 proc bsCheck {char num} { global errNum @@ -227,8 +230,8 @@ bsCheck \u41 65 bsCheck \ua 10 bsCheck \uA 10 bsCheck \340 224 -bsCheck \ua1 161 -bsCheck \u4e21 20001 +bsCheck \uA1 161 +bsCheck \u4E21 20001 bsCheck \741 60 bsCheck \U 85 bsCheck \Uk 85 @@ -236,10 +239,10 @@ bsCheck \U41 65 bsCheck \Ua 10 bsCheck \UA 10 bsCheck \Ua1 161 -bsCheck \U4e21 20001 -bsCheck \U004e21 20001 -bsCheck \U00004e21 20001 -bsCheck \U0000004e21 78 +bsCheck \U4E21 20001 +bsCheck \U004E21 20001 +bsCheck \U00004E21 20001 +bsCheck \U0000004E21 78 if {[testConstraint fullutf]} { bsCheck \U00110000 69632 bsCheck \U01100000 69632 @@ -257,14 +260,14 @@ test utf-11.2 {Tcl_UtfToUpper} { string toupper abc } ABC test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00e3ab -} \u00c3AB + string toupper \u00E3ab +} \u00C3AB test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01e3ab -} \u01e2AB + string toupper \u01E3ab +} \u01E2AB test utf-11.5 {Tcl_UtfToUpper Georgian (new in Unicode 11)} { - string toupper \u10d0\u1c90 -} \u1c90\u1c90 + string toupper \u10D0\u1C90 +} \u1C90\u1C90 test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -273,14 +276,14 @@ test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00c3AB -} \u00e3ab + string tolower \u00C3AB +} \u00E3ab test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01e2AB -} \u01e3ab + string tolower \u01E2AB +} \u01E3ab test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { - string tolower \u10d0\u1c90 -} \u10d0\u10d0 + string tolower \u10D0\u1C90 +} \u10D0\u10D0 test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -289,17 +292,17 @@ test utf-13.2 {Tcl_UtfToTitle} { string totitle abc } Abc test utf-13.3 {Tcl_UtfToTitle} { - string totitle \u00e3ab -} \u00c3ab + string totitle \u00E3ab +} \u00C3ab test utf-13.4 {Tcl_UtfToTitle} { - string totitle \u01f3ab -} \u01f2ab + string totitle \u01F3ab +} \u01F2ab test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { - string totitle \u10d0\u1c90 -} \u10d0\u1c90 + string totitle \u10D0\u1C90 +} \u10D0\u1C90 test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { - string totitle \u1c90\u10d0 -} \u1c90\u10d0 + string totitle \u1C90\u10D0 +} \u1C90\u10D0 test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -328,8 +331,8 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff\uA78D\u01c5\U10400 -} \u00ff\u00ff\u0265\u01c6\U10428 + string tolower \u0178\u00ff\uA78D\u01c5 +} \u00ff\u00ff\u0265\u01c6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! -- cgit v0.12