summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/library/re.rst127
-rw-r--r--Lib/_strptime.py4
-rwxr-xr-xLib/base64.py14
-rw-r--r--Lib/decimal.py2
-rw-r--r--Lib/distutils/cygwinccompiler.py6
-rw-r--r--Lib/distutils/emxccompiler.py2
-rw-r--r--Lib/distutils/sysconfig.py2
-rw-r--r--Lib/distutils/util.py2
-rw-r--r--Lib/distutils/version.py2
-rw-r--r--Lib/distutils/versionpredicate.py6
-rw-r--r--Lib/email/quoprimime.py12
-rw-r--r--Lib/email/utils.py17
-rw-r--r--Lib/encodings/idna.py8
-rw-r--r--Lib/ftplib.py5
-rw-r--r--Lib/html/parser.py2
-rw-r--r--Lib/http/cookiejar.py22
-rw-r--r--Lib/http/cookies.py2
-rw-r--r--Lib/imaplib.py9
-rw-r--r--Lib/json/decoder.py2
-rw-r--r--Lib/logging/handlers.py2
-rwxr-xr-xLib/platform.py16
-rw-r--r--Lib/plistlib.py2
-rw-r--r--Lib/posixpath.py2
-rw-r--r--Lib/py_compile.py2
-rw-r--r--Lib/re.py31
-rw-r--r--Lib/sre_constants.py3
-rw-r--r--Lib/sre_parse.py14
-rw-r--r--Lib/tarfile.py2
-rwxr-xr-xLib/test/re_tests.py2
-rw-r--r--Lib/test/test_bytes.py2
-rw-r--r--Lib/test/test_mmap.py2
-rw-r--r--Lib/test/test_re.py65
-rw-r--r--Lib/tokenize.py14
-rw-r--r--Lib/urllib/request.py2
-rw-r--r--Misc/NEWS8
-rw-r--r--Modules/_sre.c27
-rw-r--r--Modules/sre.h1
37 files changed, 280 insertions, 163 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index a6ebc22..f6f0d89 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -11,9 +11,13 @@
This module provides regular expression matching operations similar to
-those found in Perl. Both patterns and strings to be searched can be
-Unicode strings as well as 8-bit strings. The :mod:`re` module is
-always available.
+those found in Perl. The :mod:`re` module is always available.
+
+Both patterns and strings to be searched can be Unicode strings as well as
+8-bit strings. However, Unicode strings and 8-bit strings cannot be mixed:
+that is, you cannot match a Unicode string with a byte pattern or
+vice-versa; similarly, when asking for a substitution, the replacement
+string must be of the same type as both the pattern and the search string.
Regular expressions use the backslash character (``'\'``) to indicate
special forms or to allow special characters to be used without invoking
@@ -212,12 +216,12 @@ The special characters are:
group; ``(?P<name>...)`` is the only exception to this rule. Following are the
currently supported extensions.
-``(?iLmsux)``
- (One or more letters from the set ``'i'``, ``'L'``, ``'m'``, ``'s'``,
- ``'u'``, ``'x'``.) The group matches the empty string; the letters
- set the corresponding flags: :const:`re.I` (ignore case),
- :const:`re.L` (locale dependent), :const:`re.M` (multi-line),
- :const:`re.S` (dot matches all), :const:`re.U` (Unicode dependent),
+``(?aiLmsux)``
+ (One or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``,
+ ``'s'``, ``'u'``, ``'x'``.) The group matches the empty string; the
+ letters set the corresponding flags: :const:`re.A` (ASCII-only matching),
+ :const:`re.I` (ignore case), :const:`re.L` (locale dependent),
+ :const:`re.M` (multi-line), :const:`re.S` (dot matches all),
and :const:`re.X` (verbose), for the entire regular expression. (The
flags are described in :ref:`contents-of-module-re`.) This
is useful if you wish to include the flags as part of the regular
@@ -324,56 +328,62 @@ the second character. For example, ``\$`` matches the character ``'$'``.
word is indicated by whitespace or a non-alphanumeric, non-underscore character.
Note that ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the
precise set of characters deemed to be alphanumeric depends on the values of the
- ``UNICODE`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents
+ ``ASCII`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents
the backspace character, for compatibility with Python's string literals.
``\B``
Matches the empty string, but only when it is *not* at the beginning or end of a
word. This is just the opposite of ``\b``, so is also subject to the settings
- of ``LOCALE`` and ``UNICODE``.
+ of ``ASCII`` and ``LOCALE``.
``\d``
- When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
- is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match
- whatever is classified as a digit in the Unicode character properties database.
+ For Unicode (str) patterns:
+ When the :const:`ASCII` flag is specified, matches any decimal digit; this
+ is equivalent to the set ``[0-9]``. Otherwise, it will match whatever
+ is classified as a digit in the Unicode character properties database
+ (but this does include the standard ASCII digits and is thus a superset
+ of [0-9]).
+ For 8-bit (bytes) patterns:
+ Matches any decimal digit; this is equivalent to the set ``[0-9]``.
``\D``
- When the :const:`UNICODE` flag is not specified, matches any non-digit
- character; this is equivalent to the set ``[^0-9]``. With :const:`UNICODE`, it
- will match anything other than character marked as digits in the Unicode
- character properties database.
+ Matches any character which is not a decimal digit. This is the
+ opposite of ``\d`` and is therefore similarly subject to the settings of
+ ``ASCII`` and ``LOCALE``.
``\s``
- When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
- any whitespace character; this is equivalent to the set ``[ \t\n\r\f\v]``. With
- :const:`LOCALE`, it will match this set plus whatever characters are defined as
- space for the current locale. If :const:`UNICODE` is set, this will match the
- characters ``[ \t\n\r\f\v]`` plus whatever is classified as space in the Unicode
- character properties database.
+ For Unicode (str) patterns:
+ When the :const:`ASCII` flag is specified, matches only ASCII whitespace
+ characters; this is equivalent to the set ``[ \t\n\r\f\v]``. Otherwise,
+ it will match this set plus whatever is classified as space in the Unicode
+ character properties database (including for example the non-breaking
+ spaces mandated by typography rules in many languages).
+ For 8-bit (bytes) patterns:
+ Matches characters considered whitespace in the ASCII character set;
+ this is equivalent to the set ``[ \t\n\r\f\v]``.
``\S``
- When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
- any non-whitespace character; this is equivalent to the set ``[^ \t\n\r\f\v]``
- With :const:`LOCALE`, it will match any character not in this set, and not
- defined as space in the current locale. If :const:`UNICODE` is set, this will
- match anything other than ``[ \t\n\r\f\v]`` and characters marked as space in
- the Unicode character properties database.
+ Matches any character which is not a whitespace character. This is the
+ opposite of ``\s`` and is therefore similarly subject to the settings of
+ ``ASCII`` and ``LOCALE``.
``\w``
- When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
- any alphanumeric character and the underscore; this is equivalent to the set
- ``[a-zA-Z0-9_]``. With :const:`LOCALE`, it will match the set ``[0-9_]`` plus
- whatever characters are defined as alphanumeric for the current locale. If
- :const:`UNICODE` is set, this will match the characters ``[0-9_]`` plus whatever
- is classified as alphanumeric in the Unicode character properties database.
+ For Unicode (str) patterns:
+ When the :const:`ASCII` flag is specified, this is equivalent to the set
+ ``[a-zA-Z0-9_]``. Otherwise, it will match whatever is classified as
+ alphanumeric in the Unicode character properties database (it will
+ include most characters that can be part of a word in whatever language,
+ as well as numbers and the underscore sign).
+ For 8-bit (bytes) patterns:
+ Matches characters considered alphanumeric in the ASCII character set;
+ this is equivalent to the set ``[a-zA-Z0-9_]``. With :const:`LOCALE`,
+ it will additionally match whatever characters are defined as
+ alphanumeric for the current locale.
``\W``
- When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
- any non-alphanumeric character; this is equivalent to the set ``[^a-zA-Z0-9_]``.
- With :const:`LOCALE`, it will match any character not in the set ``[0-9_]``, and
- not defined as alphanumeric for the current locale. If :const:`UNICODE` is set,
- this will match anything other than ``[0-9_]`` and characters marked as
- alphanumeric in the Unicode character properties database.
+ Matches any character which is not an alphanumeric character. This is the
+ opposite of ``\w`` and is therefore similarly subject to the settings of
+ ``ASCII`` and ``LOCALE``.
``\Z``
Matches only at the end of the string.
@@ -454,6 +464,25 @@ form.
expression at a time needn't worry about compiling regular expressions.)
+.. data:: A
+ ASCII
+
+ Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` perform ASCII-only
+ matching instead of full Unicode matching. This is only meaningful for
+ Unicode patterns, and is ignored for byte patterns.
+
+ Note that the :const:`re.U` flag still exists (as well as its synonym
+ :const:`re.UNICODE` and its embedded counterpart ``(?u)``), but it has
+ become useless in Python 3.0.
+ In previous Python versions, it was used to specify that
+ matching had to be Unicode dependent (the default was ASCII matching in
+ all circumstances). Starting from Python 3.0, the default is Unicode
+ matching for Unicode strings (which can be changed by specifying the
+ ``'a'`` flag), and ASCII matching for 8-bit strings. Further, Unicode
+ dependent matching for 8-bit strings isn't allowed anymore and results
+ in a ValueError.
+
+
.. data:: I
IGNORECASE
@@ -465,7 +494,10 @@ form.
LOCALE
Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` dependent on the
- current locale.
+ current locale. The use of this flag is discouraged as the locale mechanism
+ is very unreliable, and it only handles one "culture" at a time anyway;
+ you should use Unicode matching instead, which is the default in Python 3.0
+ for Unicode (str) patterns.
.. data:: M
@@ -486,13 +518,6 @@ form.
newline; without this flag, ``'.'`` will match anything *except* a newline.
-.. data:: U
- UNICODE
-
- Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s`` and ``\S`` dependent
- on the Unicode character properties database.
-
-
.. data:: X
VERBOSE
@@ -511,6 +536,8 @@ form.
b = re.compile(r"\d+\.\d*")
+
+
.. function:: search(pattern, string[, flags])
Scan through *string* looking for a location where the regular expression
diff --git a/Lib/_strptime.py b/Lib/_strptime.py
index c3568b0..896c798 100644
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py
@@ -14,7 +14,7 @@ import time
import locale
import calendar
from re import compile as re_compile
-from re import IGNORECASE
+from re import IGNORECASE, ASCII
from re import escape as re_escape
from datetime import date as datetime_date
try:
@@ -262,7 +262,7 @@ class TimeRE(dict):
def compile(self, format):
"""Return a compiled re object for the format string."""
- return re_compile(self.pattern(format), IGNORECASE)
+ return re_compile(self.pattern(format), IGNORECASE | ASCII)
_cache_lock = _thread_allocate_lock()
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
diff --git a/Lib/base64.py b/Lib/base64.py
index fc80835..4308fb4 100755
--- a/Lib/base64.py
+++ b/Lib/base64.py
@@ -39,7 +39,7 @@ def _translate(s, altchars):
return s.translate(translation)
-
+
# Base64 encoding/decoding uses binascii
def b64encode(s, altchars=None):
@@ -126,7 +126,7 @@ def urlsafe_b64decode(s):
return b64decode(s, b'-_')
-
+
# Base32 encoding/decoding must be done in Python
_b32alphabet = {
0: b'A', 9: b'J', 18: b'S', 27: b'3',
@@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=None):
# characters because this will tell us how many null bytes to remove from
# the end of the decoded string.
padchars = 0
- mo = re.search('(?P<pad>[=]*)$', s)
+ mo = re.search(b'(?P<pad>[=]*)$', s)
if mo:
padchars = len(mo.group('pad'))
if padchars > 0:
@@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=None):
return b''.join(parts)
-
+
# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
# lowercase. The RFC also recommends against accepting input case
# insensitively.
@@ -291,12 +291,12 @@ def b16decode(s, casefold=False):
raise TypeError("expected bytes, not %s" % s.__class__.__name__)
if casefold:
s = s.upper()
- if re.search('[^0-9A-F]', s):
+ if re.search(b'[^0-9A-F]', s):
raise binascii.Error('Non-base16 digit found')
return binascii.unhexlify(s)
-
+
# Legacy interface. This code could be cleaned up since I don't believe
# binascii has any line length limitations. It just doesn't seem worth it
# though. The files should be opened in binary mode.
@@ -353,7 +353,7 @@ def decodestring(s):
return binascii.a2b_base64(s)
-
+
# Usable as a script...
def main():
"""Small main program"""
diff --git a/Lib/decimal.py b/Lib/decimal.py
index 2c02f11..88cc5cd 100644
--- a/Lib/decimal.py
+++ b/Lib/decimal.py
@@ -5415,7 +5415,7 @@ ExtendedContext = Context(
# 2. For finite numbers (not infinities and NaNs) the body of the
# number between the optional sign and the optional exponent must have
# at least one decimal digit, possibly after the decimal point. The
-# lookahead expression '(?=\d|\.\d)' checks this.
+# lookahead expression '(?=[0-9]|\.[0-9])' checks this.
#
# As the flag UNICODE is not enabled here, we're explicitly avoiding any
# other meaning for \d than the numbers [0-9].
diff --git a/Lib/distutils/cygwinccompiler.py b/Lib/distutils/cygwinccompiler.py
index 48875230..da2c74a 100644
--- a/Lib/distutils/cygwinccompiler.py
+++ b/Lib/distutils/cygwinccompiler.py
@@ -409,7 +409,7 @@ def get_versions():
out = os.popen(gcc_exe + ' -dumpversion','r')
out_string = out.read()
out.close()
- result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
+ result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
if result:
gcc_version = StrictVersion(result.group(1))
else:
@@ -421,7 +421,7 @@ def get_versions():
out = os.popen(ld_exe + ' -v','r')
out_string = out.read()
out.close()
- result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
+ result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
if result:
ld_version = StrictVersion(result.group(1))
else:
@@ -433,7 +433,7 @@ def get_versions():
out = os.popen(dllwrap_exe + ' --version','r')
out_string = out.read()
out.close()
- result = re.search(' (\d+\.\d+(\.\d+)*)',out_string)
+ result = re.search(' (\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
if result:
dllwrap_version = StrictVersion(result.group(1))
else:
diff --git a/Lib/distutils/emxccompiler.py b/Lib/distutils/emxccompiler.py
index d9ee82d..62a4c5b 100644
--- a/Lib/distutils/emxccompiler.py
+++ b/Lib/distutils/emxccompiler.py
@@ -300,7 +300,7 @@ def get_versions():
out = os.popen(gcc_exe + ' -dumpversion','r')
out_string = out.read()
out.close()
- result = re.search('(\d+\.\d+\.\d+)',out_string)
+ result = re.search('(\d+\.\d+\.\d+)', out_string, re.ASCII)
if result:
gcc_version = StrictVersion(result.group(1))
else:
diff --git a/Lib/distutils/sysconfig.py b/Lib/distutils/sysconfig.py
index 3a120dd..b17743a 100644
--- a/Lib/distutils/sysconfig.py
+++ b/Lib/distutils/sysconfig.py
@@ -512,7 +512,7 @@ def get_config_vars(*args):
# patched up as well.
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
flags = _config_vars[key]
- flags = re.sub('-arch\s+\w+\s', ' ', flags)
+ flags = re.sub('-arch\s+\w+\s', ' ', flags, flags=re.ASCII)
flags = re.sub('-isysroot [^ \t]*', ' ', flags)
_config_vars[key] = flags
diff --git a/Lib/distutils/util.py b/Lib/distutils/util.py
index 76798b9..b87dfbe 100644
--- a/Lib/distutils/util.py
+++ b/Lib/distutils/util.py
@@ -81,7 +81,7 @@ def get_platform ():
return "%s-%s.%s" % (osname, version, release)
elif osname[:6] == "cygwin":
osname = "cygwin"
- rel_re = re.compile (r'[\d.]+')
+ rel_re = re.compile (r'[\d.]+', re.ASCII)
m = rel_re.match(release)
if m:
release = m.group()
diff --git a/Lib/distutils/version.py b/Lib/distutils/version.py
index f71b2f6..907f71c 100644
--- a/Lib/distutils/version.py
+++ b/Lib/distutils/version.py
@@ -134,7 +134,7 @@ class StrictVersion (Version):
"""
version_re = re.compile(r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$',
- re.VERBOSE)
+ re.VERBOSE | re.ASCII)
def parse (self, vstring):
diff --git a/Lib/distutils/versionpredicate.py b/Lib/distutils/versionpredicate.py
index 434b34f..b0dd9f4 100644
--- a/Lib/distutils/versionpredicate.py
+++ b/Lib/distutils/versionpredicate.py
@@ -5,7 +5,8 @@ import distutils.version
import operator
-re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)")
+re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)",
+ re.ASCII)
# (package) (rest)
re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses
@@ -153,7 +154,8 @@ def split_provision(value):
global _provision_rx
if _provision_rx is None:
_provision_rx = re.compile(
- "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$")
+ "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$",
+ re.ASCII)
value = value.strip()
m = _provision_rx.match(value)
if not m:
diff --git a/Lib/email/quoprimime.py b/Lib/email/quoprimime.py
index 68dc11c..ca91631 100644
--- a/Lib/email/quoprimime.py
+++ b/Lib/email/quoprimime.py
@@ -70,7 +70,7 @@ for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
_QUOPRI_BODY_MAP[c] = chr(c)
-
+
# Helpers
def header_check(octet):
"""Return True if the octet should be escaped with header quopri."""
@@ -125,7 +125,7 @@ def quote(c):
return '=%02X' % ord(c)
-
+
def header_encode(header_bytes, charset='iso-8859-1'):
"""Encode a single header line with quoted-printable (like) encoding.
@@ -149,7 +149,7 @@ def header_encode(header_bytes, charset='iso-8859-1'):
return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded))
-
+
def body_encode(body, maxlinelen=76, eol=NL):
"""Encode with quoted-printable, wrapping at maxlinelen characters.
@@ -225,7 +225,7 @@ def body_encode(body, maxlinelen=76, eol=NL):
return encoded_body
-
+
# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
@@ -280,7 +280,7 @@ body_decode = decode
decodestring = decode
-
+
def _unquote_match(match):
"""Turn a match in the form =AB to the ASCII character with value 0xab"""
s = match.group(0)
@@ -296,4 +296,4 @@ def header_decode(s):
the high level email.Header class for that functionality.
"""
s = s.replace('_', ' ')
- return re.sub(r'=\w{2}', _unquote_match, s)
+ return re.sub(r'=\w{2}', _unquote_match, s, flags=re.ASCII)
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index 35275f6..465903f 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -52,7 +52,7 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[][\\()"]')
-
+
# Helpers
def formataddr(pair):
@@ -73,7 +73,7 @@ def formataddr(pair):
return address
-
+
def getaddresses(fieldvalues):
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
all = COMMASPACE.join(fieldvalues)
@@ -81,7 +81,7 @@ def getaddresses(fieldvalues):
return a.addresslist
-
+
ecre = re.compile(r'''
=\? # literal =?
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
@@ -93,7 +93,7 @@ ecre = re.compile(r'''
''', re.VERBOSE | re.IGNORECASE)
-
+
def formatdate(timeval=None, localtime=False, usegmt=False):
"""Returns a date string as specified by RFC 2822, e.g.:
@@ -146,7 +146,7 @@ def formatdate(timeval=None, localtime=False, usegmt=False):
zone)
-
+
def make_msgid(idstring=None):
"""Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
@@ -168,7 +168,7 @@ def make_msgid(idstring=None):
return msgid
-
+
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions. We use this to worm
# around broken older Pythons.
@@ -202,7 +202,7 @@ def unquote(str):
return str
-
+
# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
"""Decode string according to RFC 2231"""
@@ -227,7 +227,8 @@ def encode_rfc2231(s, charset=None, language=None):
return "%s'%s'%s" % (charset, language, s)
-rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
+rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
+ re.ASCII)
def decode_params(params):
"""Decode parameters list according to RFC 2231.
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index c2ba1da..583bdf1 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -176,12 +176,10 @@ class Codec(codecs.Codec):
return "", 0
# IDNA allows decoding to operate on Unicode strings, too.
- if isinstance(input, bytes):
- labels = dots.split(input)
- else:
- # Force to bytes
+ if not isinstance(input, bytes):
+ # XXX obviously wrong, see #3232
input = bytes(input)
- labels = input.split(b".")
+ labels = input.split(b".")
if labels and len(labels[-1]) == 0:
trailing_dot = '.'
diff --git a/Lib/ftplib.py b/Lib/ftplib.py
index 4955727..b64e45e 100644
--- a/Lib/ftplib.py
+++ b/Lib/ftplib.py
@@ -590,7 +590,8 @@ def parse150(resp):
global _150_re
if _150_re is None:
import re
- _150_re = re.compile("150 .* \((\d+) bytes\)", re.IGNORECASE)
+ _150_re = re.compile(
+ "150 .* \((\d+) bytes\)", re.IGNORECASE | re.ASCII)
m = _150_re.match(resp)
if not m:
return None
@@ -613,7 +614,7 @@ def parse227(resp):
global _227_re
if _227_re is None:
import re
- _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)')
+ _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)', re.ASCII)
m = _227_re.search(resp)
if not m:
raise error_proto(resp)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 828eece..83a5825 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -385,4 +385,4 @@ class HTMLParser(_markupbase.ParserBase):
return '&'+s+';'
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
- replaceEntities, s)
+ replaceEntities, s, flags=re.ASCII)
diff --git a/Lib/http/cookiejar.py b/Lib/http/cookiejar.py
index afd5f20..e9efab8 100644
--- a/Lib/http/cookiejar.py
+++ b/Lib/http/cookiejar.py
@@ -121,7 +121,7 @@ def time2netscape(t=None):
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
-TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
+TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
offset = None
if tz in UTC_ZONES:
@@ -191,9 +191,9 @@ def _str2time(day, mon, yr, hr, min, sec, tz):
STRICT_DATE_RE = re.compile(
r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
- "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
+ "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
- r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
+ r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
r"""^
(\d\d?) # day
@@ -210,7 +210,7 @@ LOOSE_HTTP_DATE_RE = re.compile(
([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
\s*
(?:\(\w+\))? # ASCII representation of timezone in parens.
- \s*$""", re.X)
+ \s*$""", re.X | re.ASCII)
def http2time(text):
"""Returns time in seconds since epoch of time represented by a string.
@@ -282,7 +282,7 @@ ISO_DATE_RE = re.compile(
\s*
([-+]?\d\d?:?(:?\d\d)?
|Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
- \s*$""", re.X)
+ \s*$""", re.X | re.ASCII)
def iso2time(text):
"""
As for http2time, but parses the ISO 8601 formats:
@@ -489,7 +489,7 @@ def parse_ns_headers(ns_headers):
return result
-IPV4_RE = re.compile(r"\.\d+$")
+IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
"""Return True if text is a host domain name."""
# XXX
@@ -574,7 +574,7 @@ def user_domain_match(A, B):
return True
return False
-cut_port_re = re.compile(r":\d+$")
+cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
"""Return request-host, as defined by RFC 2965.
@@ -1207,7 +1207,7 @@ class CookieJar:
domain_re = re.compile(r"[^.]*")
dots_re = re.compile(r"^\.+")
- magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
+ magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
def __init__(self, policy=None):
if policy is None:
@@ -1856,7 +1856,7 @@ class LWPCookieJar(FileCookieJar):
def _really_load(self, f, filename, ignore_discard, ignore_expires):
magic = f.readline()
- if not re.search(self.magic_re, magic):
+ if not self.magic_re.search(magic):
msg = ("%r does not look like a Set-Cookie3 (LWP) format "
"file" % filename)
raise LoadError(msg)
@@ -1965,7 +1965,7 @@ class MozillaCookieJar(FileCookieJar):
header by default (Mozilla can cope with that).
"""
- magic_re = "#( Netscape)? HTTP Cookie File"
+ magic_re = re.compile("#( Netscape)? HTTP Cookie File")
header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
@@ -1977,7 +1977,7 @@ class MozillaCookieJar(FileCookieJar):
now = time.time()
magic = f.readline()
- if not re.search(self.magic_re, magic):
+ if not self.magic_re.search(magic):
f.close()
raise LoadError(
"%r does not look like a Netscape format cookies file" %
diff --git a/Lib/http/cookies.py b/Lib/http/cookies.py
index 344bede..3242d83 100644
--- a/Lib/http/cookies.py
+++ b/Lib/http/cookies.py
@@ -445,7 +445,7 @@ _CookiePattern = re.compile(
""+ _LegalCharsPatt +"*" # Any word or empty string
r")" # End of group 'val'
r"\s*;?" # Probably ending in a semi-colon
- )
+ , re.ASCII) # May be removed if safe.
# At long last, here is the cookie class.
diff --git a/Lib/imaplib.py b/Lib/imaplib.py
index 0d9704e..156b15d 100644
--- a/Lib/imaplib.py
+++ b/Lib/imaplib.py
@@ -88,11 +88,12 @@ InternalDate = re.compile(r'.*INTERNALDATE "'
r' (?P<hour>[0-9][0-9]):(?P<min>[0-9][0-9]):(?P<sec>[0-9][0-9])'
r' (?P<zonen>[-+])(?P<zoneh>[0-9][0-9])(?P<zonem>[0-9][0-9])'
r'"')
-Literal = re.compile(r'.*{(?P<size>\d+)}$')
+Literal = re.compile(r'.*{(?P<size>\d+)}$', re.ASCII)
MapCRLF = re.compile(r'\r\n|\r|\n')
Response_code = re.compile(r'\[(?P<type>[A-Z-]+)( (?P<data>[^\]]*))?\]')
Untagged_response = re.compile(r'\* (?P<type>[A-Z-]+)( (?P<data>.*))?')
-Untagged_status = re.compile(r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?')
+Untagged_status = re.compile(
+ r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?', re.ASCII)
@@ -146,7 +147,7 @@ class IMAP4:
class abort(error): pass # Service errors - close and retry
class readonly(abort): pass # Mailbox status changed to READ-ONLY
- mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]")
+ mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]", re.ASCII)
def __init__(self, host = '', port = IMAP4_PORT):
self.debug = Debug
@@ -168,7 +169,7 @@ class IMAP4:
self.tagpre = Int2AP(random.randint(4096, 65535))
self.tagre = re.compile(r'(?P<tag>'
+ self.tagpre
- + r'\d+) (?P<type>[A-Z]+) (?P<data>.*)')
+ + r'\d+) (?P<type>[A-Z]+) (?P<data>.*)', re.ASCII)
# Get server welcome message,
# request and store CAPABILITY response.
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index 5283eae..f0bc245 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -67,7 +67,7 @@ def JSONNumber(match, context):
fn = getattr(context, 'parse_int', None) or int
res = fn(integer)
return res, None
-pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)
+pattern(r'(-?(?:0|[1-9][0-9]*))(\.[0-9]+)?([eE][-+]?[0-9]+)?')(JSONNumber)
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
diff --git a/Lib/logging/handlers.py b/Lib/logging/handlers.py
index 1e69120..19fb959 100644
--- a/Lib/logging/handlers.py
+++ b/Lib/logging/handlers.py
@@ -199,7 +199,7 @@ class TimedRotatingFileHandler(BaseRotatingHandler):
else:
raise ValueError("Invalid rollover interval specified: %s" % self.when)
- self.extMatch = re.compile(self.extMatch)
+ self.extMatch = re.compile(self.extMatch, re.ASCII)
self.interval = self.interval * interval # multiply by units requested
self.rolloverAt = currentTime + self.interval
diff --git a/Lib/platform.py b/Lib/platform.py
index ff81d28..0182180 100755
--- a/Lib/platform.py
+++ b/Lib/platform.py
@@ -118,7 +118,7 @@ _libc_search = re.compile(r'(__libc_init)'
'|'
'(GLIBC_([0-9.]+))'
'|'
- '(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)')
+ '(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)', re.ASCII)
def libc_ver(executable=sys.executable,lib='',version='',
@@ -223,15 +223,15 @@ def _dist_try_harder(distname,version,id):
return distname,version,id
-_release_filename = re.compile(r'(\w+)[-_](release|version)')
+_release_filename = re.compile(r'(\w+)[-_](release|version)', re.ASCII)
_lsb_release_version = re.compile(r'(.+)'
' release '
'([\d.]+)'
- '[^(]*(?:\((.+)\))?')
+ '[^(]*(?:\((.+)\))?', re.ASCII)
_release_version = re.compile(r'([^0-9]+)'
'(?: release )?'
'([\d.]+)'
- '[^(]*(?:\((.+)\))?')
+ '[^(]*(?:\((.+)\))?', re.ASCII)
# See also http://www.novell.com/coolsolutions/feature/11251.html
# and http://linuxmafia.com/faq/Admin/release-files.html
@@ -464,7 +464,7 @@ def _norm_version(version, build=''):
_ver_output = re.compile(r'(?:([\w ]+) ([\w.]+) '
'.*'
- 'Version ([\d.]+))')
+ 'Version ([\d.]+))', re.ASCII)
def _syscmd_ver(system='', release='', version='',
@@ -1253,16 +1253,16 @@ def processor():
_sys_version_parser = re.compile(
r'([\w.+]+)\s*'
'\(#?([^,]+),\s*([\w ]+),\s*([\w :]+)\)\s*'
- '\[([^\]]+)\]?')
+ '\[([^\]]+)\]?', re.ASCII)
_jython_sys_version_parser = re.compile(
- r'([\d\.]+)')
+ r'([\d\.]+)', re.ASCII)
_ironpython_sys_version_parser = re.compile(
r'IronPython\s*'
'([\d\.]+)'
'(?: \(([\d\.]+)\))?'
- ' on (.NET [\d\.]+)')
+ ' on (.NET [\d\.]+)', re.ASCII)
_sys_version_cache = {}
diff --git a/Lib/plistlib.py b/Lib/plistlib.py
index b775324..83e5d04 100644
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py
@@ -147,7 +147,7 @@ class DumbXMLWriter:
# Contents should conform to a subset of ISO 8601
# (in particular, YYYY '-' MM '-' DD 'T' HH ':' MM ':' SS 'Z'. Smaller units may be omitted with
# a loss of precision)
-_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z")
+_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z", re.ASCII)
def _dateFromString(s):
order = ('year', 'month', 'day', 'hour', 'minute', 'second')
diff --git a/Lib/posixpath.py b/Lib/posixpath.py
index dc0aa10..575492f 100644
--- a/Lib/posixpath.py
+++ b/Lib/posixpath.py
@@ -241,7 +241,7 @@ def expandvars(path):
return path
if not _varprog:
import re
- _varprog = re.compile(r'\$(\w+|\{[^}]*\})')
+ _varprog = re.compile(r'\$(\w+|\{[^}]*\})', re.ASCII)
i = 0
while True:
m = _varprog.search(path, i)
diff --git a/Lib/py_compile.py b/Lib/py_compile.py
index 8ef3662..cce5ac1 100644
--- a/Lib/py_compile.py
+++ b/Lib/py_compile.py
@@ -86,7 +86,7 @@ def read_encoding(file, default):
line = f.readline()
if not line:
break
- m = re.match(r".*\bcoding:\s*(\S+)\b", line)
+ m = re.match(br".*\bcoding:\s*(\S+)\b", line)
if m:
return m.group(1).decode("ascii")
return default
diff --git a/Lib/re.py b/Lib/re.py
index 951f239..63a95fd 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -44,7 +44,7 @@ The special characters are:
"|" A|B, creates an RE that will match either A or B.
(...) Matches the RE inside the parentheses.
The contents can be retrieved or matched later in the string.
- (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
+ (?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below).
(?:...) Non-grouping version of regular parentheses.
(?P<name>...) The substring matched by the group is accessible by name.
(?P=name) Matches the text matched earlier by the group named name.
@@ -64,11 +64,18 @@ resulting RE will match the second character.
\Z Matches only at the end of the string.
\b Matches the empty string, but only at the start or end of a word.
\B Matches the empty string, but not at the start or end of a word.
- \d Matches any decimal digit; equivalent to the set [0-9].
- \D Matches any non-digit character; equivalent to the set [^0-9].
+ \d Matches any decimal digit; equivalent to the set [0-9] in
+ bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the whole
+ range of Unicode digits.
+ \D Matches any non-digit character; equivalent to [^\d].
\s Matches any whitespace character; equivalent to [ \t\n\r\f\v].
\S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
- \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
+ \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
+ in bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the
+ range of Unicode alphanumeric characters (letters plus digits
+ plus underscore).
With LOCALE, it will match the set [0-9_] plus characters defined
as letters for the current locale.
\W Matches the complement of \w.
@@ -87,6 +94,12 @@ This module exports the following functions:
escape Backslash all non-alphanumerics in a string.
Some of the functions in this module takes flags as optional parameters:
+ A ASCII For string patterns, make \w, \W, \b, \B, \d, \D
+ match the corresponding ASCII character categories
+ (rather than the whole Unicode categories, which is the
+ default).
+ For bytes patterns, this flag is the only available
+ behaviour and needn't be specified.
I IGNORECASE Perform case-insensitive matching.
L LOCALE Make \w, \W, \b, \B, dependent on the current locale.
M MULTILINE "^" matches the beginning of lines (after a newline)
@@ -95,7 +108,8 @@ Some of the functions in this module takes flags as optional parameters:
as the end of the string.
S DOTALL "." matches any character at all, including the newline.
X VERBOSE Ignore whitespace and comments for nicer looking RE's.
- U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale.
+ U UNICODE For compatibility only. Ignored for string patterns (it
+ is the default), and forbidden for bytes patterns.
This module also defines an exception 'error'.
@@ -107,16 +121,17 @@ import sre_parse
# public symbols
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
- "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
- "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
+ "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
+ "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
"UNICODE", "error" ]
__version__ = "2.2.1"
# flags
+A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
-U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
+U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 94962cb..6cf69c3 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -207,9 +207,10 @@ SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
-SRE_FLAG_UNICODE = 32 # use unicode locale
+SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
+SRE_FLAG_ASCII = 256 # use ascii "locale"
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index ffa8902..9d6e631 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -64,6 +64,7 @@ FLAGS = {
"s": SRE_FLAG_DOTALL,
"x": SRE_FLAG_VERBOSE,
# extensions
+ "a": SRE_FLAG_ASCII,
"t": SRE_FLAG_TEMPLATE,
"u": SRE_FLAG_UNICODE,
}
@@ -672,6 +673,18 @@ def _parse(source, state):
return subpattern
+def fix_flags(src, flags):
+ # Check and fix flags according to the type of pattern (str or bytes)
+ if isinstance(src, str):
+ if not flags & SRE_FLAG_ASCII:
+ flags |= SRE_FLAG_UNICODE
+ elif flags & SRE_FLAG_UNICODE:
+ raise ValueError("ASCII and UNICODE flags are incompatible")
+ else:
+ if flags & SRE_FLAG_UNICODE:
+ raise ValueError("can't use UNICODE flag with a bytes pattern")
+ return flags
+
def parse(str, flags=0, pattern=None):
# parse 're' pattern into list of (opcode, argument) tuples
@@ -683,6 +696,7 @@ def parse(str, flags=0, pattern=None):
pattern.str = str
p = _parse_sub(source, pattern, 0)
+ p.pattern.flags = fix_flags(str, p.pattern.flags)
tail = source.get()
if tail == ")":
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index ecb32f6..222b433 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -1368,7 +1368,7 @@ class TarInfo(object):
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
# the newline. keyword and value are both UTF-8 encoded strings.
- regex = re.compile(r"(\d+) ([^=]+)=", re.U)
+ regex = re.compile(br"(\d+) ([^=]+)=")
pos = 0
while True:
match = regex.match(buf, pos)
diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py
index 220301a..d314e20 100755
--- a/Lib/test/re_tests.py
+++ b/Lib/test/re_tests.py
@@ -667,4 +667,4 @@ tests.extend([
(r'\b.\b', 'a', SUCCEED, 'found', 'a'),
(r'(?u)\b.\b', u, SUCCEED, 'found', u),
(r'(?u)\w', u, SUCCEED, 'found', u),
- ])
+])
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index c64ac19..630f862 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -506,7 +506,7 @@ class ByteArrayTest(BaseBytesTest):
def by(s):
return bytearray(map(ord, s))
b = by("Hello, world")
- self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")])
+ self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")])
def test_setitem(self):
b = bytearray([1, 2, 3])
diff --git a/Lib/test/test_mmap.py b/Lib/test/test_mmap.py
index 0b5202a..9fe044f 100644
--- a/Lib/test/test_mmap.py
+++ b/Lib/test/test_mmap.py
@@ -54,7 +54,7 @@ class MmapTests(unittest.TestCase):
m.flush()
# Test doing a regular expression match in an mmap'ed file
- match = re.search('[A-Za-z]+', m)
+ match = re.search(b'[A-Za-z]+', m)
if match is None:
self.fail('regex match on mmap failed!')
else:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 60b816e..755cb00 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -83,23 +83,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
'abc\ndef\n')
- def test_bug_1140(self):
- # re.sub(x, y, b'') should return b'', not '', and
- # re.sub(x, y, '') should return '', not b''.
- # Also:
- # re.sub(x, y, str(x)) should return str(y), and
- # re.sub(x, y, bytes(x)) should return
- # str(y) if isinstance(y, str) else unicode(y).
- for x in 'x', b'x':
- for y in 'y', b'y':
- z = re.sub(x, y, b'')
- self.assertEqual(z, b'')
- self.assertEqual(type(z), bytes)
- #
- z = re.sub(x, y, '')
- self.assertEqual(z, '')
- self.assertEqual(type(z), str)
-
def test_bug_1661(self):
# Verify that flags do not get silently ignored with compiled patterns
pattern = re.compile('.')
@@ -327,7 +310,7 @@ class ReTests(unittest.TestCase):
def test_getattr(self):
self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
- self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I)
+ self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
@@ -614,8 +597,8 @@ class ReTests(unittest.TestCase):
import array
for typecode in 'bBuhHiIlLfd':
a = array.array(typecode)
- self.assertEqual(re.compile("bla").match(a), None)
- self.assertEqual(re.compile("").match(a).groups(), ())
+ self.assertEqual(re.compile(b"bla").match(a), None)
+ self.assertEqual(re.compile(b"").match(a).groups(), ())
def test_inline_flags(self):
# Bug #1700
@@ -658,6 +641,48 @@ class ReTests(unittest.TestCase):
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
self.assertEqual(pattern.sub('#', '\n'), '#\n#')
+ def test_bytes_str_mixing(self):
+ # Mixing str and bytes is disallowed
+ pat = re.compile('.')
+ bpat = re.compile(b'.')
+ self.assertRaises(TypeError, pat.match, b'b')
+ self.assertRaises(TypeError, bpat.match, 'b')
+ self.assertRaises(TypeError, pat.sub, b'b', 'c')
+ self.assertRaises(TypeError, pat.sub, 'b', b'c')
+ self.assertRaises(TypeError, pat.sub, b'b', b'c')
+ self.assertRaises(TypeError, bpat.sub, b'b', 'c')
+ self.assertRaises(TypeError, bpat.sub, 'b', b'c')
+ self.assertRaises(TypeError, bpat.sub, 'b', 'c')
+
+ def test_ascii_and_unicode_flag(self):
+ # String patterns
+ for flags in (0, re.UNICODE):
+ pat = re.compile('\xc0', flags | re.IGNORECASE)
+ self.assertNotEqual(pat.match('\xe0'), None)
+ pat = re.compile('\w', flags)
+ self.assertNotEqual(pat.match('\xe0'), None)
+ pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('(?a)\xc0', re.IGNORECASE)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('\w', re.ASCII)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('(?a)\w')
+ self.assertEqual(pat.match('\xe0'), None)
+ # Bytes patterns
+ for flags in (0, re.ASCII):
+ pat = re.compile(b'\xc0', re.IGNORECASE)
+ self.assertEqual(pat.match(b'\xe0'), None)
+ pat = re.compile(b'\w')
+ self.assertEqual(pat.match(b'\xe0'), None)
+ # Incompatibilities
+ self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
+ self.assertRaises(ValueError, re.compile, b'(?u)\w')
+ self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
+ self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
+ self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
+ self.assertRaises(ValueError, re.compile, '(?au)\w')
+
def run_re_tests():
from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 31366de..ec5a79a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -47,21 +47,23 @@ def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
+# Note: we use unicode matching for names ("\w") but ascii matching for
+# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'
-Hexnumber = r'0[xX][\da-fA-F]+'
+Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9]\d*)'
+Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?\d+'
-Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
-Expfloat = r'\d+' + Exponent
+Exponent = r'[eE][-+]?[0-9]+'
+Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index 889d9642..aaab059 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -141,7 +141,7 @@ def urlcleanup():
_opener = None
# copied from cookielib.py
-_cut_port_re = re.compile(r":\d+$")
+_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
"""Return request-host, as defined by RFC 2965.
diff --git a/Misc/NEWS b/Misc/NEWS
index 3823772..e8fa5f5 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -30,6 +30,14 @@ Core and Builtins
Library
-------
+- Issue #2834: update the regular expression library to match the unicode
+ standards of py3k. In other words, mixing bytes and unicode strings
+ (be it as pattern, search string or replacement string) raises a TypeError.
+ Moreover, the re.UNICODE flag is enabled automatically for unicode patterns,
+ and can be disabled by specifying a new re.ASCII flag; as for bytes
+ patterns, ASCII matching is the only option and trying to specify re.UNICODE
+ for such patterns raises a ValueError.
+
- Issue #3300: make urllib.parse.[un]quote() default to UTF-8.
Code contributed by Matt Giuca. quote() now encodes the input
before quoting, unquote() decodes after unquoting. There are
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 64fc513..2a54d8e 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1691,7 +1691,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
/* get pointer to string buffer */
view.len = -1;
buffer = Py_TYPE(string)->tp_as_buffer;
- if (!buffer || !buffer->bf_getbuffer ||
+ if (!buffer || !buffer->bf_getbuffer ||
(*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
return NULL;
@@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
if (PyBytes_Check(string) || bytes == size)
charsize = 1;
#if defined(HAVE_UNICODE)
- else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
+ else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
charsize = sizeof(Py_UNICODE);
#endif
else {
@@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
*p_charsize = charsize;
if (ptr == NULL) {
- PyErr_SetString(PyExc_ValueError,
+ PyErr_SetString(PyExc_ValueError,
"Buffer is NULL");
}
return ptr;
@@ -1754,6 +1754,17 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
if (!ptr)
return NULL;
+ if (charsize == 1 && pattern->charsize > 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "can't use a string pattern on a bytes-like object");
+ return NULL;
+ }
+ if (charsize > 1 && pattern->charsize == 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "can't use a bytes pattern on a string-like object");
+ return NULL;
+ }
+
/* adjust boundaries */
if (start < 0)
start = 0;
@@ -2682,6 +2693,16 @@ _compile(PyObject* self_, PyObject* args)
return NULL;
}
+ if (pattern == Py_None)
+ self->charsize = -1;
+ else {
+ Py_ssize_t p_length;
+ if (!getstring(pattern, &p_length, &self->charsize)) {
+ PyObject_DEL(self);
+ return NULL;
+ }
+ }
+
Py_INCREF(pattern);
self->pattern = pattern;
diff --git a/Modules/sre.h b/Modules/sre.h
index d4af05c..518c11d 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -30,6 +30,7 @@ typedef struct {
PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */
PyObject *weakreflist; /* List of weak references */
+ int charsize; /* pattern charsize (or -1) */
/* pattern code */
Py_ssize_t codesize;
SRE_CODE code[1];