From 413ea81a284c691dc5ed4ad48217370ce83f65f7 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 4 May 2020 14:17:23 +0000 Subject: More progress/simplification --- generic/tclUtf.c | 23 ++--------------------- tests/utf.test | 33 ++++++++++++--------------------- 2 files changed, 14 insertions(+), 42 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c4b5305..b964b7e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -70,25 +70,6 @@ static const unsigned char totalBytes[256] = { /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1,1,1,1 -}; - -static const unsigned char complete[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ - 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 4 4,4,4,4,4, #else @@ -183,7 +164,7 @@ Invalid( unsigned char byte = *src; int index; - if (byte % 0x04) { + if ((byte & 0xC3) != 0xC0) { /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ return 0; } @@ -573,7 +554,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= complete[UCHAR(*src)]; + return length >= totalBytes[UCHAR(*src)]; } /* diff --git a/tests/utf.test b/tests/utf.test index c455078..3f74f6f 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -493,25 +493,16 @@ test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} { test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { testutfnext \xF4\x90\x80\x80 } 1 -test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext ucs2} { +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { testutfnext \xA0\xA0\xA0 -} 1 -test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext fullutf knownBug} { - testutfnext \xA0\xA0\xA0 -} 3 -test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} { - testutfnext \x80\x80\x80 } 3 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext fullutf knownBug} { +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { testutfnext \x80\x80\x80 } 3 -test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} { - testutfnext \xA0\xA0\xA0\xA0 -} 1 -test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext fullutf knownBug} { +test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \xA0\xA0\xA0\xA0 } 3 -test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext ucs2} { +test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext { testutfnext \x80\x80\x80\x80 } 3 test utf-6.96 {Tcl_UtfNext, read limits} testutfnext { @@ -619,18 +610,18 @@ test utf-6.121 {Tcl_UtfNext, read limits} {testutfnext ucs2} { test utf-6.122 {Tcl_UtfNext, read limits} {testutfnext ucs2} { testutfnext \xA0\xA0\xA0 2 } 0 -test utf-6.123 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +test utf-6.123 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0G 3 -} 1 -test utf-6.124 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 3 +test utf-6.124 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0\xA0 3 -} 1 -test utf-6.125 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 3 +test utf-6.125 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0\xA0G 4 -} 1 -test utf-6.126 {Tcl_UtfNext, read limits} {testutfnext ucs2} { +} 3 +test utf-6.126 {Tcl_UtfNext, read limits} testutfnext { testutfnext \xA0\xA0\xA0\xA0\xA0 4 -} 1 +} 3 test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} -- cgit v0.12