summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2018-04-17 21:49:00 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2018-04-17 21:49:00 (GMT)
commit: 28ac08663306381af8f310b247bec60e5ed694db (patch)
tree: 3ebfa96ba64918e4542607a87281b7631d2f969d
parent: 994ba283c9757b14d86fa9157ca304e4317af34c (diff)
downloadtcl-28ac08663306381af8f310b247bec60e5ed694db.zip
tcl-28ac08663306381af8f310b247bec60e5ed694db.tar.gz
tcl-28ac08663306381af8f310b247bec60e5ed694db.tar.bz2
Slightly better handling of unmatched surrogates. Unmatched high surrogates are still silently removed, but unmatched low surrogates now pass through as-is. Inspired by Kevin Kenny's remarks. Thanks!
-rw-r--r-- generic/tclUtf.c 19
1 files changed, 13 insertions, 6 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 923b1f8..ab4e142 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -148,15 +148,22 @@ Tcl_UniCharToUtf(
if ((ch & 0xF800) == 0xD800) {
if (ch & 0x0400) {
/* Low surrogate */
- buf[3] = (char) ((ch | 0x80) & 0xBF);
- buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
- return 4;
+ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+ && ((buf[2] & 0xCF) == 0)) {
+ /* Previous Tcl_UniChar was a High surrogate, so combine */
+ buf[3] = (char) ((ch & 0x3F) | 0x80);
+ buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+ return 4;
+ }
+ /* Previous Tcl_UniChar was not a High surrogate, so just output */
} else {
/* High surrogate */
ch += 0x40;
- buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
- buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
- buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ /* Fill buffer with specific 3-byte (invalid) byte combination,
+ so following Low surrogate can recognize it and combine */
+ buf[2] = (char) ((ch << 4) & 0x30);
+ buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
+ buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
return 0;
}
}