summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author: Mark Dickinson <mdickinson@enthought.com>, 2012-06-23 20:45:14 (GMT)
committer: Mark Dickinson <mdickinson@enthought.com>, 2012-06-23 20:45:14 (GMT)
commit: 106c4145ff79f1cf6c2377ee305c8201483102a0 (patch)
tree: 7b69ec25392b8a664f357e999d72ecfa73145052 /Objects
parent: 16ad7a254a54c8e756406e79d4ad7d9a2536ad52 (diff)
download: cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.zip
cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.gz
cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.bz2
Issue #14923: Optimize continuation-byte check in UTF-8 decoding. Patch by Serhiy Storchaka.
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/stringlib/codecs.h | 16
1 file changed, 10 insertions, 6 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index fb35493..63bf648 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -15,6 +15,9 @@
# error C 'long' size should be either 4 or 8!
#endif
+/* 10xxxxxx */
+#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
+
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
STRINGLIB_CHAR *dest,
@@ -107,7 +110,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
break;
}
ch2 = (unsigned char)s[1];
- if ((ch2 & 0xC0) != 0x80)
+ if (!IS_CONTINUATION_BYTE(ch2))
/* invalid continuation byte */
goto InvalidContinuation;
ch = (ch << 6) + ch2 -
@@ -131,8 +134,8 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
}
ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2];
- if ((ch2 & 0xC0) != 0x80 ||
- (ch3 & 0xC0) != 0x80) {
+ if (!IS_CONTINUATION_BYTE(ch2) ||
+ !IS_CONTINUATION_BYTE(ch3)) {
/* invalid continuation byte */
goto InvalidContinuation;
}
@@ -172,9 +175,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2];
ch4 = (unsigned char)s[3];
- if ((ch2 & 0xC0) != 0x80 ||
- (ch3 & 0xC0) != 0x80 ||
- (ch4 & 0xC0) != 0x80) {
+ if (!IS_CONTINUATION_BYTE(ch2) ||
+ !IS_CONTINUATION_BYTE(ch3) ||
+ !IS_CONTINUATION_BYTE(ch4)) {
/* invalid continuation byte */
goto InvalidContinuation;
}
@@ -216,6 +219,7 @@ InvalidContinuation:
}
#undef ASCII_CHAR_MASK
+#undef IS_CONTINUATION_BYTE
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow