From 2ea2ef0609d7e306bf981672cda2e66782ed4db3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 11 Mar 2021 12:19:14 +0000 Subject: Backport Tcl_UtfCharComplete() functionality from 8.6 for TCL_UTF_MAX>3. This makes Tcl_UtfCharComplete() usable to protect Tcl_UtfNext() calls for overflow. No change for TCL_UTF_MAX=3 (default build) --- generic/tclUtf.c | 19 ++++++++++++++++++- tests/utf.test | 6 +++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 03d0f3a..efbd383 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -76,6 +76,23 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1 }; +#if TCL_UTF_MAX > 3 +static const unsigned char complete[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/* End of "continuation byte section" */ + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 +}; +#else +# define complete totalBytes +#endif + /* * Functions used only in this module. */ @@ -492,7 +509,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= totalBytes[UCHAR(*src)]; + return length >= complete[UCHAR(*src)]; } /* diff --git a/tests/utf.test b/tests/utf.test index 06ac329..76b6847 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -8,8 +8,8 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. -if {[lsearch [namespace children] ::tcltest] == -1} { - package require tcltest 2 +if {"::tcltest" ni [namespace children]} { + package require tcltest 2.5 namespace import -force ::tcltest::* } @@ -614,7 +614,7 @@ test utf-6.117.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} test utf-6.118 {Tcl_UtfNext, read limits} {testutfnext testbytestring} { testutfnext [testbytestring \xA0]G 0 } 0 -test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext testbytestring} { +test utf-6.119 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0]G 1 } 1 test utf-6.120 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} { -- cgit v0.12 From b0e0d4b618d58c962735cb62982229a8f67fb632 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 14 Mar 2021 16:12:25 +0000 Subject: Document that Tcl_UtfCharComplete() can (now) be used to protect Tcl_UtfNext() calls against overflow, if the string being handled is not NULL-terminated. --- doc/Utf.3 | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index cca6498..9687eb6 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -141,8 +141,8 @@ source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first -byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0080 and -0x00FF and return 1. +byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x80 and +0xFF and return 1. .PP \fBTcl_UniCharToUtfDString\fR converts the given Unicode string to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. @@ -197,10 +197,10 @@ characters. .PP \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR of \fIlength\fR bytes is long enough to be decoded by -\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee -that the UTF-8 string is properly formed. This routine is used by -procedures that are operating on a byte at a time and need to know if a -full Tcl_UniChar has been seen. +\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise. This function +does not guarantee that the UTF-8 string is properly formed. This routine +is used by procedures that are operating on a byte at a time and need to +know if a full Tcl_UniChar has been seen. .PP \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It returns the number of Tcl_UniChars that are represented by the UTF-8 string @@ -221,7 +221,8 @@ Given \fIsrc\fR, a pointer to some location in a UTF-8 string, \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null -character. +character. \fBTcl_UtfCharComplete\fR can be used in that case to +make sure enough bytes are available before calling \fBTcl_UtfNext\fR. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made -- cgit v0.12