summary refs log tree commit diff stats
diff options
context:
space:
mode:
author R David Murray <rdmurray@bitdance.com> 2013-06-26 16:06:21 (GMT)
committer R David Murray <rdmurray@bitdance.com> 2013-06-26 16:06:21 (GMT)
commit b83ee30fc1110d8e2d7ca0fb08b79506830a207a (patch)
tree c4404d086770acfb58325b75582c4d943b253477
parent dd3a6a5533bed5f9d1250856e4aa9edd70ec9ef2 (diff)
download cpython-b83ee30fc1110d8e2d7ca0fb08b79506830a207a.zip
cpython-b83ee30fc1110d8e2d7ca0fb08b79506830a207a.tar.gz
cpython-b83ee30fc1110d8e2d7ca0fb08b79506830a207a.tar.bz2
#11454: Reduce email module load time, improve surrogate check efficiency.
The new _has_surrogates code was suggested by Serhiy Storchaka. See the issue for timings, but it is far faster than any other alternative, and also removes the load time that we previously incurred from compiling the complex regex this replaces.
-rw-r--r-- Lib/email/utils.py 14
1 files changed, 10 insertions, 4 deletions
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index 93a625c..b3b42bb 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -54,10 +54,16 @@ TICK = "'"
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
-# How to figure out if we are processing strings that come from a byte
-# source with undecodable characters.
-_has_surrogates = re.compile(
- '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+def _has_surrogates(s):
+ """Return True if s contains surrogate-escaped binary data."""
+ # This check is based on the fact that unless there are surrogates, utf8
+ # (Python's default encoding) can encode any string. This is the fastest
+ # way to check for surrogates, see issue 11454 for timings.
+ try:
+ s.encode()
+ return False
+ except UnicodeEncodeError:
+ return True
# How to deal with a string containing bytes before handing it to the
# application through the 'normal' interface.