diff options
author | Mark Dickinson <mdickinson@enthought.com> | 2012-06-23 20:45:14 (GMT) |
---|---|---|
committer | Mark Dickinson <mdickinson@enthought.com> | 2012-06-23 20:45:14 (GMT) |
commit | 106c4145ff79f1cf6c2377ee305c8201483102a0 (patch) | |
tree | 7b69ec25392b8a664f357e999d72ecfa73145052 /Objects/stringlib | |
parent | 16ad7a254a54c8e756406e79d4ad7d9a2536ad52 (diff) | |
download | cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.zip cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.gz cpython-106c4145ff79f1cf6c2377ee305c8201483102a0.tar.bz2 |
Issue #14923: Optimize continuation-byte check in UTF-8 decoding. Patch by Serhiy Storchaka.
Diffstat (limited to 'Objects/stringlib')
-rw-r--r-- | Objects/stringlib/codecs.h | 16 |
1 files changed, 10 insertions, 6 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index fb35493..63bf648 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -15,6 +15,9 @@
 # error C 'long' size should be either 4 or 8!
 #endif
 
+/* 10xxxxxx */
+#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
+
 Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                        STRINGLIB_CHAR *dest,
@@ -107,7 +110,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                 break;
             }
             ch2 = (unsigned char)s[1];
-            if ((ch2 & 0xC0) != 0x80)
+            if (!IS_CONTINUATION_BYTE(ch2))
                 /* invalid continuation byte */
                 goto InvalidContinuation;
             ch = (ch << 6) + ch2 -
@@ -131,8 +134,8 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
             }
             ch2 = (unsigned char)s[1];
             ch3 = (unsigned char)s[2];
-            if ((ch2 & 0xC0) != 0x80 ||
-                (ch3 & 0xC0) != 0x80) {
+            if (!IS_CONTINUATION_BYTE(ch2) ||
+                !IS_CONTINUATION_BYTE(ch3)) {
                 /* invalid continuation byte */
                 goto InvalidContinuation;
             }
@@ -172,9 +175,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
             ch2 = (unsigned char)s[1];
             ch3 = (unsigned char)s[2];
             ch4 = (unsigned char)s[3];
-            if ((ch2 & 0xC0) != 0x80 ||
-                (ch3 & 0xC0) != 0x80 ||
-                (ch4 & 0xC0) != 0x80) {
+            if (!IS_CONTINUATION_BYTE(ch2) ||
+                !IS_CONTINUATION_BYTE(ch3) ||
+                !IS_CONTINUATION_BYTE(ch4)) {
                 /* invalid continuation byte */
                 goto InvalidContinuation;
             }
@@ -216,6 +219,7 @@ InvalidContinuation:
 }
 
 #undef ASCII_CHAR_MASK
+#undef IS_CONTINUATION_BYTE
 
 
 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow