Issue #14923: Optimize continuation-byte check in UTF-8 decoding. Patch by Serhiy Storchaka.

author: Mark Dickinson <mdickinson@enthought.com> 2012-06-23 20:45:14 (GMT)
committer: Mark Dickinson <mdickinson@enthought.com> 2012-06-23 20:45:14 (GMT)
commit: 106c4145ff79f1cf6c2377ee305c8201483102a0 (patch)
tree: 7b69ec25392b8a664f357e999d72ecfa73145052 /Objects
parent: 16ad7a254a54c8e756406e79d4ad7d9a2536ad52 (diff)
download: cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.zip cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.gz cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.bz2
1 files changed, 10 insertions, 6 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index fb35493..63bf648 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -15,6 +15,9 @@
 # error C 'long' size should be either 4 or 8!
 #endif
 
+/* 10xxxxxx: true iff ch lies in 0x80..0xBF (a UTF-8 continuation byte); note ch is evaluated twice */
+#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
+
 Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                        STRINGLIB_CHAR *dest,
@@ -107,7 +110,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                 break;
             }
             ch2 = (unsigned char)s[1];
-            if ((ch2 & 0xC0) != 0x80)
+            if (!IS_CONTINUATION_BYTE(ch2))
                 /* invalid continuation byte */
                 goto InvalidContinuation;
             ch = (ch << 6) + ch2 -
@@ -131,8 +134,8 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
             }
             ch2 = (unsigned char)s[1];
             ch3 = (unsigned char)s[2];
-            if ((ch2 & 0xC0) != 0x80 ||
-                (ch3 & 0xC0) != 0x80) {
+            if (!IS_CONTINUATION_BYTE(ch2) ||
+                !IS_CONTINUATION_BYTE(ch3)) {
                 /* invalid continuation byte */
                 goto InvalidContinuation;
             }
@@ -172,9 +175,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
             ch2 = (unsigned char)s[1];
             ch3 = (unsigned char)s[2];
             ch4 = (unsigned char)s[3];
-            if ((ch2 & 0xC0) != 0x80 ||
-                (ch3 & 0xC0) != 0x80 ||
-                (ch4 & 0xC0) != 0x80) {
+            if (!IS_CONTINUATION_BYTE(ch2) ||
+                !IS_CONTINUATION_BYTE(ch3) ||
+                !IS_CONTINUATION_BYTE(ch4)) {
                 /* invalid continuation byte */
                 goto InvalidContinuation;
             }
@@ -216,6 +219,7 @@ InvalidContinuation:
 }
 
 #undef ASCII_CHAR_MASK
+#undef IS_CONTINUATION_BYTE
 
 
 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
author	Mark Dickinson <mdickinson@enthought.com>	2012-06-23 20:45:14 (GMT)
committer	Mark Dickinson <mdickinson@enthought.com>	2012-06-23 20:45:14 (GMT)
commit	106c4145ff79f1cf6c2377ee305c8201483102a0 (patch)
tree	7b69ec25392b8a664f357e999d72ecfa73145052 /Objects
parent	16ad7a254a54c8e756406e79d4ad7d9a2536ad52 (diff)
download	cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.zip cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.gz cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.bz2