From 04ea3b8bff2991e54cc2469b372927735c9d7a83 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 18 Mar 2019 22:32:37 +0000 Subject: Comment Tcl_UniCharToUtf() better, explaining what happens when handling surrogates. Add type cast in tclUtf.c, making the actual check clearer --- doc/Utf.3 | 7 ++++--- generic/tclScan.c | 2 +- generic/tclUtf.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index afcff79..111aae6 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -133,9 +133,10 @@ represent one Unicode character in the UTF-8 representation. \fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string in starting at \fIbuf\fR. The return value is the number of bytes stored in \fIbuf\fR. If ch is a high surrogate (range U+D800 - U+DBFF), then -the return value will be 0 and nothing will be stored. If you still -want to produce UTF-8 output for it (even though knowing it's an illegal -code-point on its own), just call \fBTcl_UniCharToUtf\fR again using ch = -1. +the return value will be 1 and a single byte in the range 0xF0 - 0xF4 +will be stored. If you still want to produce UTF-8 output for it (even +though knowing it's an illegal code-point on its own), just call +\fBTcl_UniCharToUtf\fR again specifying ch = -1. .PP \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the diff --git a/generic/tclScan.c b/generic/tclScan.c index 0736cfb..74ec2da 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -261,7 +261,7 @@ ValidateFormat( Tcl_UniChar ch = 0; int objIndex, xpgSize, nspace = numVars; int *nassign = TclStackAlloc(interp, nspace * sizeof(int)); - char buf[TCL_UTF_MAX+1] = ""; + char buf[TCL_UTF_MAX + 1] = ""; Tcl_Obj *errorMsg; /* Place to build an error messages. 
Note that * these are messy operations because we do * not want to use the formatting engine; diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b5d8824..f3561f9 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -446,7 +446,7 @@ Tcl_UtfToUniChar( #else *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((*chPtr - 0x10000) <= 0xFFFFF) { + if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { return 4; } #endif -- cgit v0.12