summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2020-04-15 22:39:53 (GMT)
committerdgp <dgp@users.sourceforge.net>2020-04-15 22:39:53 (GMT)
commitd491ca6385bb8c8e630dea43cc287b5a881233ce (patch)
tree3ec5d5777535207c2446b5531cbe98f969ab6afb /generic/tclUtf.c
parent1f6e0698a6ecceec504384a0d228e56b2c1aba42 (diff)
downloadtcl-d491ca6385bb8c8e630dea43cc287b5a881233ce.zip
tcl-d491ca6385bb8c8e630dea43cc287b5a881233ce.tar.gz
tcl-d491ca6385bb8c8e630dea43cc287b5a881233ce.tar.bz2
Refactor the Overlong test into a utility routine.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c85
1 files changed, 56 insertions, 29 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6003b75..f3b2097 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -81,6 +81,7 @@ static CONST unsigned char totalBytes[256] = {
*/
static int UtfCount(int ch);
+static int Overlong(unsigned char *src);
/*
*---------------------------------------------------------------------------
@@ -115,7 +116,59 @@ UtfCount(
#endif
return 3;
}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Overlong --
+ *
+ * Utility routine to report whether /src/ points to the start of an
+ * overlong byte sequence that should be rejected.
+ *
+ * Results:
+ * A boolean.
+ *---------------------------------------------------------------------------
+ */
+INLINE static int
+Overlong(
+ unsigned char *src) /* Points to lead byte of a UTF-8 byte
+ * sequence. Caller guarantees it is safe
+ * to read src[0] and src[1]. */
+{
+ switch (*src) {
+ case 0xC0:
+ if (src[1] == 0x80) {
+ /* Valid sequence: \xC0\x80 for \u0000 */
+ return 0;
+ }
+ /* Reject overlong: \xC0\x81 - \xC0\xBF */
+ return 1;
+ case 0xC1:
+ /* Reject overlong: \xC1\x80 - \xC1\xBF */
+ return 1;
+ case 0xE0:
+ if (src[1] < 0xA0) {
+ /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */
+ return 1;
+ }
+ /* Valid sequence: \xE0\xA0\x80 for \u0800 , etc. */
+ return 0;
+#if TCL_UTF_MAX > 3
+ case 0xF0:
+ if (src[1] < 0x90) {
+ /* Reject overlong: \xF0\x80\x80\x80 - \xF0\x8F\xBF\xBF */
+ return 1;
+ }
+ /* Valid sequence: \xF0\x90\x80\x80 for \U10000 , etc. */
+ return 0;
+#endif
+ default:
+ /* All other lead bytes lead only valid sequences */
+ return 0;
+ }
+}
+
/*
*---------------------------------------------------------------------------
*
@@ -756,37 +809,11 @@ Tcl_UtfPrev(
* Use that capability to screen out overlong sequences.
*/
- switch (byte) {
- case 0xC0:
- if (look[1] == 0x80) {
- /* Valid sequence: \xC0\x80 for \u0000 */
- return (CONST char *)look;
- }
- /* Reject overlong: \xC0\x81 - \xC0\xBF */
+ if (Overlong(look)) {
+ /* Reject */
return fallback;
- case 0xC1:
- /* Reject overlong: \xC1\x80 - \xC1\xBF */
- return fallback;
- case 0xE0:
- if (look[1] < 0xA0) {
- /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */
- return fallback;
- }
- /* Valid sequence: \xE0\xA0\x80 for \u0800 , etc. */
- return (CONST char *)look;
-#if TCL_UTF_MAX > 3
- case 0xF0:
- if (look[1] < 0x90) {
- /* Reject overlong: \xF0\x80\x80\x80 - \xF0\x8F\xBF\xBF */
- return fallback;
- }
- /* Valid sequence: \xF0\x90\x80\x80 for \U10000 , etc. */
- return (CONST char *)look;
-#endif
- default:
- /* All other lead bytes lead only valid sequences */
- return (CONST char *)look;
}
+ return (CONST char *)look;
}
/* We saw a trail byte. */