diff options
37 files changed, 280 insertions, 163 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst index a6ebc22..f6f0d89 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -11,9 +11,13 @@ This module provides regular expression matching operations similar to -those found in Perl. Both patterns and strings to be searched can be -Unicode strings as well as 8-bit strings. The :mod:`re` module is -always available. +those found in Perl. The :mod:`re` module is always available. + +Both patterns and strings to be searched can be Unicode strings as well as +8-bit strings. However, Unicode strings and 8-bit strings cannot be mixed: +that is, you cannot match a Unicode string with a byte pattern or +vice-versa; similarly, when asking for a substitution, the replacement +string must be of the same type as both the pattern and the search string. Regular expressions use the backslash character (``'\'``) to indicate special forms or to allow special characters to be used without invoking @@ -212,12 +216,12 @@ The special characters are: group; ``(?P<name>...)`` is the only exception to this rule. Following are the currently supported extensions. -``(?iLmsux)`` - (One or more letters from the set ``'i'``, ``'L'``, ``'m'``, ``'s'``, - ``'u'``, ``'x'``.) The group matches the empty string; the letters - set the corresponding flags: :const:`re.I` (ignore case), - :const:`re.L` (locale dependent), :const:`re.M` (multi-line), - :const:`re.S` (dot matches all), :const:`re.U` (Unicode dependent), +``(?aiLmsux)`` + (One or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``, + ``'s'``, ``'u'``, ``'x'``.) The group matches the empty string; the + letters set the corresponding flags: :const:`re.A` (ASCII-only matching), + :const:`re.I` (ignore case), :const:`re.L` (locale dependent), + :const:`re.M` (multi-line), :const:`re.S` (dot matches all), and :const:`re.X` (verbose), for the entire regular expression. (The flags are described in :ref:`contents-of-module-re`.) 
This is useful if you wish to include the flags as part of the regular @@ -324,56 +328,62 @@ the second character. For example, ``\$`` matches the character ``'$'``. word is indicated by whitespace or a non-alphanumeric, non-underscore character. Note that ``\b`` is defined as the boundary between ``\w`` and ``\ W``, so the precise set of characters deemed to be alphanumeric depends on the values of the - ``UNICODE`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents + ``ASCII`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents the backspace character, for compatibility with Python's string literals. ``\B`` Matches the empty string, but only when it is *not* at the beginning or end of a word. This is just the opposite of ``\b``, so is also subject to the settings - of ``LOCALE`` and ``UNICODE``. + of ``ASCII`` and ``LOCALE`` . ``\d`` - When the :const:`UNICODE` flag is not specified, matches any decimal digit; this - is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match - whatever is classified as a digit in the Unicode character properties database. + For Unicode (str) patterns: + When the :const:`ASCII` flag is specified, matches any decimal digit; this + is equivalent to the set ``[0-9]``. Otherwise, it will match whatever + is classified as a digit in the Unicode character properties database + (but this does include the standard ASCII digits and is thus a superset + of [0-9]). + For 8-bit (bytes) patterns: + Matches any decimal digit; this is equivalent to the set ``[0-9]``. ``\D`` - When the :const:`UNICODE` flag is not specified, matches any non-digit - character; this is equivalent to the set ``[^0-9]``. With :const:`UNICODE`, it - will match anything other than character marked as digits in the Unicode - character properties database. + Matches any character which is not a decimal digit. This is the + opposite of ``\d`` and is therefore similarly subject to the settings of + ``ASCII`` and ``LOCALE``. 
``\s`` - When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches - any whitespace character; this is equivalent to the set ``[ \t\n\r\f\v]``. With - :const:`LOCALE`, it will match this set plus whatever characters are defined as - space for the current locale. If :const:`UNICODE` is set, this will match the - characters ``[ \t\n\r\f\v]`` plus whatever is classified as space in the Unicode - character properties database. + For Unicode (str) patterns: + When the :const:`ASCII` flag is specified, matches only ASCII whitespace + characters; this is equivalent to the set ``[ \t\n\r\f\v]``. Otherwise, + it will match this set plus whatever is classified as space in the Unicode + character properties database (including for example the non-breaking + spaces mandated by typography rules in many languages). + For 8-bit (bytes) patterns: + Matches characters considered whitespace in the ASCII character set; + this is equivalent to the set ``[ \t\n\r\f\v]``. ``\S`` - When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches - any non-whitespace character; this is equivalent to the set ``[^ \t\n\r\f\v]`` - With :const:`LOCALE`, it will match any character not in this set, and not - defined as space in the current locale. If :const:`UNICODE` is set, this will - match anything other than ``[ \t\n\r\f\v]`` and characters marked as space in - the Unicode character properties database. + Matches any character which is not a whitespace character. This is the + opposite of ``\s`` and is therefore similarly subject to the settings of + ``ASCII`` and ``LOCALE``. ``\w`` - When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches - any alphanumeric character and the underscore; this is equivalent to the set - ``[a-zA-Z0-9_]``. With :const:`LOCALE`, it will match the set ``[0-9_]`` plus - whatever characters are defined as alphanumeric for the current locale. 
If - :const:`UNICODE` is set, this will match the characters ``[0-9_]`` plus whatever - is classified as alphanumeric in the Unicode character properties database. + For Unicode (str) patterns: + When the :const:`ASCII` flag is specified, this is equivalent to the set + ``[a-zA-Z0-9_]``. Otherwise, it will match whatever is classified as + alphanumeric in the Unicode character properties database (it will + include most characters that can be part of a word in whatever language, + as well as numbers and the underscore sign). + For 8-bit (bytes) patterns: + Matches characters considered alphanumeric in the ASCII character set; + this is equivalent to the set ``[a-zA-Z0-9_]``. With :const:`LOCALE`, + it will additionally match whatever characters are defined as + alphanumeric for the current locale. ``\W`` - When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches - any non-alphanumeric character; this is equivalent to the set ``[^a-zA-Z0-9_]``. - With :const:`LOCALE`, it will match any character not in the set ``[0-9_]``, and - not defined as alphanumeric for the current locale. If :const:`UNICODE` is set, - this will match anything other than ``[0-9_]`` and characters marked as - alphanumeric in the Unicode character properties database. + Matches any character which is not an alphanumeric character. This is the + opposite of ``\w`` and is therefore similarly subject to the settings of + ``ASCII`` and ``LOCALE``. ``\Z`` Matches only at the end of the string. @@ -454,6 +464,25 @@ form. expression at a time needn't worry about compiling regular expressions.) +.. data:: A + ASCII + + Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` perform ASCII-only + matching instead of full Unicode matching. This is only meaningful for + Unicode patterns, and is ignored for byte patterns. 
+ + Note that the :const:`re.U` flag still exists (as well as its synonym + :const:`re.UNICODE` and its embedded counterpart ``(?u)``), but it has + become useless in Python 3.0. + In previous Python versions, it was used to specify that + matching had to be Unicode dependent (the default was ASCII matching in + all circumstances). Starting from Python 3.0, the default is Unicode + matching for Unicode strings (which can be changed by specifying the + ``'a'`` flag), and ASCII matching for 8-bit strings. Further, Unicode + dependent matching for 8-bit strings isn't allowed anymore and results + in a ValueError. + + .. data:: I IGNORECASE @@ -465,7 +494,10 @@ form. LOCALE Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` dependent on the - current locale. + current locale. The use of this flag is discouraged as the locale mechanism + is very unreliable, and it only handles one "culture" at a time anyway; + you should use Unicode matching instead, which is the default in Python 3.0 + for Unicode (str) patterns. .. data:: M @@ -486,13 +518,6 @@ form. newline; without this flag, ``'.'`` will match anything *except* a newline. -.. data:: U - UNICODE - - Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s`` and ``\S`` dependent - on the Unicode character properties database. - - .. data:: X VERBOSE @@ -511,6 +536,8 @@ form. b = re.compile(r"\d+\.\d*") + + .. 
function:: search(pattern, string[, flags]) Scan through *string* looking for a location where the regular expression diff --git a/Lib/_strptime.py b/Lib/_strptime.py index c3568b0..896c798 100644 --- a/Lib/_strptime.py +++ b/Lib/_strptime.py @@ -14,7 +14,7 @@ import time import locale import calendar from re import compile as re_compile -from re import IGNORECASE +from re import IGNORECASE, ASCII from re import escape as re_escape from datetime import date as datetime_date try: @@ -262,7 +262,7 @@ class TimeRE(dict): def compile(self, format): """Return a compiled re object for the format string.""" - return re_compile(self.pattern(format), IGNORECASE) + return re_compile(self.pattern(format), IGNORECASE | ASCII) _cache_lock = _thread_allocate_lock() # DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock diff --git a/Lib/base64.py b/Lib/base64.py index fc80835..4308fb4 100755 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -39,7 +39,7 @@ def _translate(s, altchars): return s.translate(translation) - + # Base64 encoding/decoding uses binascii def b64encode(s, altchars=None): @@ -126,7 +126,7 @@ def urlsafe_b64decode(s): return b64decode(s, b'-_') - + # Base32 encoding/decoding must be done in Python _b32alphabet = { 0: b'A', 9: b'J', 18: b'S', 27: b'3', @@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=None): # characters because this will tell us how many null bytes to remove from # the end of the decoded string. padchars = 0 - mo = re.search('(?P<pad>[=]*)$', s) + mo = re.search(b'(?P<pad>[=]*)$', s) if mo: padchars = len(mo.group('pad')) if padchars > 0: @@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=None): return b''.join(parts) - + # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns # lowercase. The RFC also recommends against accepting input case # insensitively. 
@@ -291,12 +291,12 @@ def b16decode(s, casefold=False): raise TypeError("expected bytes, not %s" % s.__class__.__name__) if casefold: s = s.upper() - if re.search('[^0-9A-F]', s): + if re.search(b'[^0-9A-F]', s): raise binascii.Error('Non-base16 digit found') return binascii.unhexlify(s) - + # Legacy interface. This code could be cleaned up since I don't believe # binascii has any line length limitations. It just doesn't seem worth it # though. The files should be opened in binary mode. @@ -353,7 +353,7 @@ def decodestring(s): return binascii.a2b_base64(s) - + # Usable as a script... def main(): """Small main program""" diff --git a/Lib/decimal.py b/Lib/decimal.py index 2c02f11..88cc5cd 100644 --- a/Lib/decimal.py +++ b/Lib/decimal.py @@ -5415,7 +5415,7 @@ ExtendedContext = Context( # 2. For finite numbers (not infinities and NaNs) the body of the # number between the optional sign and the optional exponent must have # at least one decimal digit, possibly after the decimal point. The -# lookahead expression '(?=\d|\.\d)' checks this. +# lookahead expression '(?=[0-9]|\.[0-9])' checks this. # # As the flag UNICODE is not enabled here, we're explicitly avoiding any # other meaning for \d than the numbers [0-9]. 
diff --git a/Lib/distutils/cygwinccompiler.py b/Lib/distutils/cygwinccompiler.py index 48875230..da2c74a 100644 --- a/Lib/distutils/cygwinccompiler.py +++ b/Lib/distutils/cygwinccompiler.py @@ -409,7 +409,7 @@ def get_versions(): out = os.popen(gcc_exe + ' -dumpversion','r') out_string = out.read() out.close() - result = re.search('(\d+\.\d+(\.\d+)*)',out_string) + result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII) if result: gcc_version = StrictVersion(result.group(1)) else: @@ -421,7 +421,7 @@ def get_versions(): out = os.popen(ld_exe + ' -v','r') out_string = out.read() out.close() - result = re.search('(\d+\.\d+(\.\d+)*)',out_string) + result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII) if result: ld_version = StrictVersion(result.group(1)) else: @@ -433,7 +433,7 @@ def get_versions(): out = os.popen(dllwrap_exe + ' --version','r') out_string = out.read() out.close() - result = re.search(' (\d+\.\d+(\.\d+)*)',out_string) + result = re.search(' (\d+\.\d+(\.\d+)*)', out_string, re.ASCII) if result: dllwrap_version = StrictVersion(result.group(1)) else: diff --git a/Lib/distutils/emxccompiler.py b/Lib/distutils/emxccompiler.py index d9ee82d..62a4c5b 100644 --- a/Lib/distutils/emxccompiler.py +++ b/Lib/distutils/emxccompiler.py @@ -300,7 +300,7 @@ def get_versions(): out = os.popen(gcc_exe + ' -dumpversion','r') out_string = out.read() out.close() - result = re.search('(\d+\.\d+\.\d+)',out_string) + result = re.search('(\d+\.\d+\.\d+)', out_string, re.ASCII) if result: gcc_version = StrictVersion(result.group(1)) else: diff --git a/Lib/distutils/sysconfig.py b/Lib/distutils/sysconfig.py index 3a120dd..b17743a 100644 --- a/Lib/distutils/sysconfig.py +++ b/Lib/distutils/sysconfig.py @@ -512,7 +512,7 @@ def get_config_vars(*args): # patched up as well. 
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'): flags = _config_vars[key] - flags = re.sub('-arch\s+\w+\s', ' ', flags) + flags = re.sub('-arch\s+\w+\s', ' ', flags, flags=re.ASCII) flags = re.sub('-isysroot [^ \t]*', ' ', flags) _config_vars[key] = flags diff --git a/Lib/distutils/util.py b/Lib/distutils/util.py index 76798b9..b87dfbe 100644 --- a/Lib/distutils/util.py +++ b/Lib/distutils/util.py @@ -81,7 +81,7 @@ def get_platform (): return "%s-%s.%s" % (osname, version, release) elif osname[:6] == "cygwin": osname = "cygwin" - rel_re = re.compile (r'[\d.]+') + rel_re = re.compile (r'[\d.]+', re.ASCII) m = rel_re.match(release) if m: release = m.group() diff --git a/Lib/distutils/version.py b/Lib/distutils/version.py index f71b2f6..907f71c 100644 --- a/Lib/distutils/version.py +++ b/Lib/distutils/version.py @@ -134,7 +134,7 @@ class StrictVersion (Version): """ version_re = re.compile(r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$', - re.VERBOSE) + re.VERBOSE | re.ASCII) def parse (self, vstring): diff --git a/Lib/distutils/versionpredicate.py b/Lib/distutils/versionpredicate.py index 434b34f..b0dd9f4 100644 --- a/Lib/distutils/versionpredicate.py +++ b/Lib/distutils/versionpredicate.py @@ -5,7 +5,8 @@ import distutils.version import operator -re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)") +re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)", + re.ASCII) # (package) (rest) re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses @@ -153,7 +154,8 @@ def split_provision(value): global _provision_rx if _provision_rx is None: _provision_rx = re.compile( - "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$") + "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$", + re.ASCII) value = value.strip() m = _provision_rx.match(value) if not m: diff --git a/Lib/email/quoprimime.py b/Lib/email/quoprimime.py index 68dc11c..ca91631 100644 --- a/Lib/email/quoprimime.py +++ b/Lib/email/quoprimime.py @@ -70,7 +70,7 
@@ for c in (b' !"#$%&\'()*+,-./0123456789:;<>' _QUOPRI_BODY_MAP[c] = chr(c) - + # Helpers def header_check(octet): """Return True if the octet should be escaped with header quopri.""" @@ -125,7 +125,7 @@ def quote(c): return '=%02X' % ord(c) - + def header_encode(header_bytes, charset='iso-8859-1'): """Encode a single header line with quoted-printable (like) encoding. @@ -149,7 +149,7 @@ def header_encode(header_bytes, charset='iso-8859-1'): return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded)) - + def body_encode(body, maxlinelen=76, eol=NL): """Encode with quoted-printable, wrapping at maxlinelen characters. @@ -225,7 +225,7 @@ def body_encode(body, maxlinelen=76, eol=NL): return encoded_body - + # BAW: I'm not sure if the intent was for the signature of this function to be # the same as base64MIME.decode() or not... def decode(encoded, eol=NL): @@ -280,7 +280,7 @@ body_decode = decode decodestring = decode - + def _unquote_match(match): """Turn a match in the form =AB to the ASCII character with value 0xab""" s = match.group(0) @@ -296,4 +296,4 @@ def header_decode(s): the high level email.Header class for that functionality. """ s = s.replace('_', ' ') - return re.sub(r'=\w{2}', _unquote_match, s) + return re.sub(r'=\w{2}', _unquote_match, s, flags=re.ASCII) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 35275f6..465903f 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -52,7 +52,7 @@ specialsre = re.compile(r'[][\\()<>@,:;".]') escapesre = re.compile(r'[][\\()"]') - + # Helpers def formataddr(pair): @@ -73,7 +73,7 @@ def formataddr(pair): return address - + def getaddresses(fieldvalues): """Return a list of (REALNAME, EMAIL) for each fieldvalue.""" all = COMMASPACE.join(fieldvalues) @@ -81,7 +81,7 @@ def getaddresses(fieldvalues): return a.addresslist - + ecre = re.compile(r''' =\? # literal =? (?P<charset>[^?]*?) # non-greedy up to the next ? 
is the charset @@ -93,7 +93,7 @@ ecre = re.compile(r''' ''', re.VERBOSE | re.IGNORECASE) - + def formatdate(timeval=None, localtime=False, usegmt=False): """Returns a date string as specified by RFC 2822, e.g.: @@ -146,7 +146,7 @@ def formatdate(timeval=None, localtime=False, usegmt=False): zone) - + def make_msgid(idstring=None): """Returns a string suitable for RFC 2822 compliant Message-ID, e.g: @@ -168,7 +168,7 @@ def make_msgid(idstring=None): return msgid - + # These functions are in the standalone mimelib version only because they've # subsequently been fixed in the latest Python versions. We use this to worm # around broken older Pythons. @@ -202,7 +202,7 @@ def unquote(str): return str - + # RFC2231-related functions - parameter encoding and decoding def decode_rfc2231(s): """Decode string according to RFC 2231""" @@ -227,7 +227,8 @@ def encode_rfc2231(s, charset=None, language=None): return "%s'%s'%s" % (charset, language, s) -rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$') +rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$', + re.ASCII) def decode_params(params): """Decode parameters list according to RFC 2231. diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index c2ba1da..583bdf1 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -176,12 +176,10 @@ class Codec(codecs.Codec): return "", 0 # IDNA allows decoding to operate on Unicode strings, too. - if isinstance(input, bytes): - labels = dots.split(input) - else: - # Force to bytes + if not isinstance(input, bytes): + # XXX obviously wrong, see #3232 input = bytes(input) - labels = input.split(b".") + labels = input.split(b".") if labels and len(labels[-1]) == 0: trailing_dot = '.' 
diff --git a/Lib/ftplib.py b/Lib/ftplib.py index 4955727..b64e45e 100644 --- a/Lib/ftplib.py +++ b/Lib/ftplib.py @@ -590,7 +590,8 @@ def parse150(resp): global _150_re if _150_re is None: import re - _150_re = re.compile("150 .* \((\d+) bytes\)", re.IGNORECASE) + _150_re = re.compile( + "150 .* \((\d+) bytes\)", re.IGNORECASE | re.ASCII) m = _150_re.match(resp) if not m: return None @@ -613,7 +614,7 @@ def parse227(resp): global _227_re if _227_re is None: import re - _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)') + _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)', re.ASCII) m = _227_re.search(resp) if not m: raise error_proto(resp) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 828eece..83a5825 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -385,4 +385,4 @@ class HTMLParser(_markupbase.ParserBase): return '&'+s+';' return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", - replaceEntities, s) + replaceEntities, s, flags=re.ASCII) diff --git a/Lib/http/cookiejar.py b/Lib/http/cookiejar.py index afd5f20..e9efab8 100644 --- a/Lib/http/cookiejar.py +++ b/Lib/http/cookiejar.py @@ -121,7 +121,7 @@ def time2netscape(t=None): UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} -TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") +TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) def offset_from_tz_string(tz): offset = None if tz in UTC_ZONES: @@ -191,9 +191,9 @@ def _str2time(day, mon, yr, hr, min, sec, tz): STRICT_DATE_RE = re.compile( r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " - "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") + "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) WEEKDAY_RE = re.compile( - r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) + r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) LOOSE_HTTP_DATE_RE = re.compile( r"""^ (\d\d?) # day @@ -210,7 +210,7 @@ LOOSE_HTTP_DATE_RE = re.compile( ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone \s* (?:\(\w+\))? 
# ASCII representation of timezone in parens. - \s*$""", re.X) + \s*$""", re.X | re.ASCII) def http2time(text): """Returns time in seconds since epoch of time represented by a string. @@ -282,7 +282,7 @@ ISO_DATE_RE = re.compile( \s* ([-+]?\d\d?:?(:?\d\d)? |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) - \s*$""", re.X) + \s*$""", re.X | re. ASCII) def iso2time(text): """ As for http2time, but parses the ISO 8601 formats: @@ -489,7 +489,7 @@ def parse_ns_headers(ns_headers): return result -IPV4_RE = re.compile(r"\.\d+$") +IPV4_RE = re.compile(r"\.\d+$", re.ASCII) def is_HDN(text): """Return True if text is a host domain name.""" # XXX @@ -574,7 +574,7 @@ def user_domain_match(A, B): return True return False -cut_port_re = re.compile(r":\d+$") +cut_port_re = re.compile(r":\d+$", re.ASCII) def request_host(request): """Return request-host, as defined by RFC 2965. @@ -1207,7 +1207,7 @@ class CookieJar: domain_re = re.compile(r"[^.]*") dots_re = re.compile(r"^\.+") - magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" + magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) def __init__(self, policy=None): if policy is None: @@ -1856,7 +1856,7 @@ class LWPCookieJar(FileCookieJar): def _really_load(self, f, filename, ignore_discard, ignore_expires): magic = f.readline() - if not re.search(self.magic_re, magic): + if not self.magic_re.search(magic): msg = ("%r does not look like a Set-Cookie3 (LWP) format " "file" % filename) raise LoadError(msg) @@ -1965,7 +1965,7 @@ class MozillaCookieJar(FileCookieJar): header by default (Mozilla can cope with that). """ - magic_re = "#( Netscape)? HTTP Cookie File" + magic_re = re.compile("#( Netscape)? 
HTTP Cookie File") header = """\ # Netscape HTTP Cookie File # http://www.netscape.com/newsref/std/cookie_spec.html @@ -1977,7 +1977,7 @@ class MozillaCookieJar(FileCookieJar): now = time.time() magic = f.readline() - if not re.search(self.magic_re, magic): + if not self.magic_re.search(magic): f.close() raise LoadError( "%r does not look like a Netscape format cookies file" % diff --git a/Lib/http/cookies.py b/Lib/http/cookies.py index 344bede..3242d83 100644 --- a/Lib/http/cookies.py +++ b/Lib/http/cookies.py @@ -445,7 +445,7 @@ _CookiePattern = re.compile( ""+ _LegalCharsPatt +"*" # Any word or empty string r")" # End of group 'val' r"\s*;?" # Probably ending in a semi-colon - ) + , re.ASCII) # May be removed if safe. # At long last, here is the cookie class. diff --git a/Lib/imaplib.py b/Lib/imaplib.py index 0d9704e..156b15d 100644 --- a/Lib/imaplib.py +++ b/Lib/imaplib.py @@ -88,11 +88,12 @@ InternalDate = re.compile(r'.*INTERNALDATE "' r' (?P<hour>[0-9][0-9]):(?P<min>[0-9][0-9]):(?P<sec>[0-9][0-9])' r' (?P<zonen>[-+])(?P<zoneh>[0-9][0-9])(?P<zonem>[0-9][0-9])' r'"') -Literal = re.compile(r'.*{(?P<size>\d+)}$') +Literal = re.compile(r'.*{(?P<size>\d+)}$', re.ASCII) MapCRLF = re.compile(r'\r\n|\r|\n') Response_code = re.compile(r'\[(?P<type>[A-Z-]+)( (?P<data>[^\]]*))?\]') Untagged_response = re.compile(r'\* (?P<type>[A-Z-]+)( (?P<data>.*))?') -Untagged_status = re.compile(r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?') +Untagged_status = re.compile( + r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?', re.ASCII) @@ -146,7 +147,7 @@ class IMAP4: class abort(error): pass # Service errors - close and retry class readonly(abort): pass # Mailbox status changed to READ-ONLY - mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]") + mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]", re.ASCII) def __init__(self, host = '', port = IMAP4_PORT): self.debug = Debug @@ -168,7 +169,7 @@ class IMAP4: self.tagpre = Int2AP(random.randint(4096, 65535)) self.tagre 
= re.compile(r'(?P<tag>' + self.tagpre - + r'\d+) (?P<type>[A-Z]+) (?P<data>.*)') + + r'\d+) (?P<type>[A-Z]+) (?P<data>.*)', re.ASCII) # Get server welcome message, # request and store CAPABILITY response. diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py index 5283eae..f0bc245 100644 --- a/Lib/json/decoder.py +++ b/Lib/json/decoder.py @@ -67,7 +67,7 @@ def JSONNumber(match, context): fn = getattr(context, 'parse_int', None) or int res = fn(integer) return res, None -pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber) +pattern(r'(-?(?:0|[1-9][0-9]*))(\.[0-9]+)?([eE][-+]?[0-9]+)?')(JSONNumber) STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) diff --git a/Lib/logging/handlers.py b/Lib/logging/handlers.py index 1e69120..19fb959 100644 --- a/Lib/logging/handlers.py +++ b/Lib/logging/handlers.py @@ -199,7 +199,7 @@ class TimedRotatingFileHandler(BaseRotatingHandler): else: raise ValueError("Invalid rollover interval specified: %s" % self.when) - self.extMatch = re.compile(self.extMatch) + self.extMatch = re.compile(self.extMatch, re.ASCII) self.interval = self.interval * interval # multiply by units requested self.rolloverAt = currentTime + self.interval diff --git a/Lib/platform.py b/Lib/platform.py index ff81d28..0182180 100755 --- a/Lib/platform.py +++ b/Lib/platform.py @@ -118,7 +118,7 @@ _libc_search = re.compile(r'(__libc_init)' '|' '(GLIBC_([0-9.]+))' '|' - '(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)') + '(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)', re.ASCII) def libc_ver(executable=sys.executable,lib='',version='', @@ -223,15 +223,15 @@ def _dist_try_harder(distname,version,id): return distname,version,id -_release_filename = re.compile(r'(\w+)[-_](release|version)') +_release_filename = re.compile(r'(\w+)[-_](release|version)', re.ASCII) _lsb_release_version = re.compile(r'(.+)' ' release ' '([\d.]+)' - '[^(]*(?:\((.+)\))?') + '[^(]*(?:\((.+)\))?', re.ASCII) _release_version = re.compile(r'([^0-9]+)' '(?: release )?' 
'([\d.]+)' - '[^(]*(?:\((.+)\))?') + '[^(]*(?:\((.+)\))?', re.ASCII) # See also http://www.novell.com/coolsolutions/feature/11251.html # and http://linuxmafia.com/faq/Admin/release-files.html @@ -464,7 +464,7 @@ def _norm_version(version, build=''): _ver_output = re.compile(r'(?:([\w ]+) ([\w.]+) ' '.*' - 'Version ([\d.]+))') + 'Version ([\d.]+))', re.ASCII) def _syscmd_ver(system='', release='', version='', @@ -1253,16 +1253,16 @@ def processor(): _sys_version_parser = re.compile( r'([\w.+]+)\s*' '\(#?([^,]+),\s*([\w ]+),\s*([\w :]+)\)\s*' - '\[([^\]]+)\]?') + '\[([^\]]+)\]?', re.ASCII) _jython_sys_version_parser = re.compile( - r'([\d\.]+)') + r'([\d\.]+)', re.ASCII) _ironpython_sys_version_parser = re.compile( r'IronPython\s*' '([\d\.]+)' '(?: \(([\d\.]+)\))?' - ' on (.NET [\d\.]+)') + ' on (.NET [\d\.]+)', re.ASCII) _sys_version_cache = {} diff --git a/Lib/plistlib.py b/Lib/plistlib.py index b775324..83e5d04 100644 --- a/Lib/plistlib.py +++ b/Lib/plistlib.py @@ -147,7 +147,7 @@ class DumbXMLWriter: # Contents should conform to a subset of ISO 8601 # (in particular, YYYY '-' MM '-' DD 'T' HH ':' MM ':' SS 'Z'. 
Smaller units may be omitted with # a loss of precision) -_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z") +_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z", re.ASCII) def _dateFromString(s): order = ('year', 'month', 'day', 'hour', 'minute', 'second') diff --git a/Lib/posixpath.py b/Lib/posixpath.py index dc0aa10..575492f 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -241,7 +241,7 @@ def expandvars(path): return path if not _varprog: import re - _varprog = re.compile(r'\$(\w+|\{[^}]*\})') + _varprog = re.compile(r'\$(\w+|\{[^}]*\})', re.ASCII) i = 0 while True: m = _varprog.search(path, i) diff --git a/Lib/py_compile.py b/Lib/py_compile.py index 8ef3662..cce5ac1 100644 --- a/Lib/py_compile.py +++ b/Lib/py_compile.py @@ -86,7 +86,7 @@ def read_encoding(file, default): line = f.readline() if not line: break - m = re.match(r".*\bcoding:\s*(\S+)\b", line) + m = re.match(br".*\bcoding:\s*(\S+)\b", line) if m: return m.group(1).decode("ascii") return default @@ -44,7 +44,7 @@ The special characters are: "|" A|B, creates an RE that will match either A or B. (...) Matches the RE inside the parentheses. The contents can be retrieved or matched later in the string. - (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). + (?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below). (?:...) Non-grouping version of regular parentheses. (?P<name>...) The substring matched by the group is accessible by name. (?P=name) Matches the text matched earlier by the group named name. @@ -64,11 +64,18 @@ resulting RE will match the second character. \Z Matches only at the end of the string. \b Matches the empty string, but only at the start or end of a word. \B Matches the empty string, but not at the start or end of a word. 
- \d Matches any decimal digit; equivalent to the set [0-9]. - \D Matches any non-digit character; equivalent to the set [^0-9]. + \d Matches any decimal digit; equivalent to the set [0-9] in + bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the whole + range of Unicode digits. + \D Matches any non-digit character; equivalent to [^\d]. \s Matches any whitespace character; equivalent to [ \t\n\r\f\v]. \S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v]. - \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]. + \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_] + in bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the + range of Unicode alphanumeric characters (letters plus digits + plus underscore). With LOCALE, it will match the set [0-9_] plus characters defined as letters for the current locale. \W Matches the complement of \w. @@ -87,6 +94,12 @@ This module exports the following functions: escape Backslash all non-alphanumerics in a string. Some of the functions in this module takes flags as optional parameters: + A ASCII For string patterns, make \w, \W, \b, \B, \d, \D + match the corresponding ASCII character categories + (rather than the whole Unicode categories, which is the + default). + For bytes patterns, this flag is the only available + behaviour and needn't be specified. I IGNORECASE Perform case-insensitive matching. L LOCALE Make \w, \W, \b, \B, dependent on the current locale. M MULTILINE "^" matches the beginning of lines (after a newline) @@ -95,7 +108,8 @@ Some of the functions in this module takes flags as optional parameters: as the end of the string. S DOTALL "." matches any character at all, including the newline. X VERBOSE Ignore whitespace and comments for nicer looking RE's. - U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. 
+ U UNICODE For compatibility only. Ignored for string patterns (it + is the default), and forbidden for bytes patterns. This module also defines an exception 'error'. @@ -107,16 +121,17 @@ import sre_parse # public symbols __all__ = [ "match", "search", "sub", "subn", "split", "findall", - "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", - "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", + "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X", + "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", "UNICODE", "error" ] __version__ = "2.2.1" # flags +A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale" I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale -U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale +U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale" M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 94962cb..6cf69c3 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -207,9 +207,10 @@ SRE_FLAG_IGNORECASE = 2 # case insensitive SRE_FLAG_LOCALE = 4 # honour system locale SRE_FLAG_MULTILINE = 8 # treat target as multiline string SRE_FLAG_DOTALL = 16 # treat target as a single string -SRE_FLAG_UNICODE = 32 # use unicode locale +SRE_FLAG_UNICODE = 32 # use unicode "locale" SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_ASCII = 256 # use ascii "locale" # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index ffa8902..9d6e631 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -64,6 +64,7 @@ FLAGS = { "s": SRE_FLAG_DOTALL, 
"x": SRE_FLAG_VERBOSE, # extensions + "a": SRE_FLAG_ASCII, "t": SRE_FLAG_TEMPLATE, "u": SRE_FLAG_UNICODE, } @@ -672,6 +673,18 @@ def _parse(source, state): return subpattern +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + if not flags & SRE_FLAG_ASCII: + flags |= SRE_FLAG_UNICODE + elif flags & SRE_FLAG_UNICODE: + raise ValueError("ASCII and UNICODE flags are incompatible") + else: + if flags & SRE_FLAG_UNICODE: + raise ValueError("can't use UNICODE flag with a bytes pattern") + return flags + def parse(str, flags=0, pattern=None): # parse 're' pattern into list of (opcode, argument) tuples @@ -683,6 +696,7 @@ def parse(str, flags=0, pattern=None): pattern.str = str p = _parse_sub(source, pattern, 0) + p.pattern.flags = fix_flags(str, p.pattern.flags) tail = source.get() if tail == ")": diff --git a/Lib/tarfile.py b/Lib/tarfile.py index ecb32f6..222b433 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1368,7 +1368,7 @@ class TarInfo(object): # "%d %s=%s\n" % (length, keyword, value). length is the size # of the complete record including the length field itself and # the newline. keyword and value are both UTF-8 encoded strings. 
- regex = re.compile(r"(\d+) ([^=]+)=", re.U) + regex = re.compile(br"(\d+) ([^=]+)=") pos = 0 while True: match = regex.match(buf, pos) diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py index 220301a..d314e20 100755 --- a/Lib/test/re_tests.py +++ b/Lib/test/re_tests.py @@ -667,4 +667,4 @@ tests.extend([ (r'\b.\b', 'a', SUCCEED, 'found', 'a'), (r'(?u)\b.\b', u, SUCCEED, 'found', u), (r'(?u)\w', u, SUCCEED, 'found', u), - ]) +]) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index c64ac19..630f862 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -506,7 +506,7 @@ class ByteArrayTest(BaseBytesTest): def by(s): return bytearray(map(ord, s)) b = by("Hello, world") - self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")]) + self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")]) def test_setitem(self): b = bytearray([1, 2, 3]) diff --git a/Lib/test/test_mmap.py b/Lib/test/test_mmap.py index 0b5202a..9fe044f 100644 --- a/Lib/test/test_mmap.py +++ b/Lib/test/test_mmap.py @@ -54,7 +54,7 @@ class MmapTests(unittest.TestCase): m.flush() # Test doing a regular expression match in an mmap'ed file - match = re.search('[A-Za-z]+', m) + match = re.search(b'[A-Za-z]+', m) if match is None: self.fail('regex match on mmap failed!') else: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 60b816e..755cb00 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -83,23 +83,6 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 'abc\ndef\n') - def test_bug_1140(self): - # re.sub(x, y, b'') should return b'', not '', and - # re.sub(x, y, '') should return '', not b''. - # Also: - # re.sub(x, y, str(x)) should return str(y), and - # re.sub(x, y, bytes(x)) should return - # str(y) if isinstance(y, str) else unicode(y). 
- for x in 'x', b'x': - for y in 'y', b'y': - z = re.sub(x, y, b'') - self.assertEqual(z, b'') - self.assertEqual(type(z), bytes) - # - z = re.sub(x, y, '') - self.assertEqual(z, '') - self.assertEqual(type(z), str) - def test_bug_1661(self): # Verify that flags do not get silently ignored with compiled patterns pattern = re.compile('.') @@ -327,7 +310,7 @@ class ReTests(unittest.TestCase): def test_getattr(self): self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") - self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I) + self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) self.assertEqual(re.compile("(?i)(a)(b)").groups, 2) self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {}) self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, @@ -614,8 +597,8 @@ class ReTests(unittest.TestCase): import array for typecode in 'bBuhHiIlLfd': a = array.array(typecode) - self.assertEqual(re.compile("bla").match(a), None) - self.assertEqual(re.compile("").match(a).groups(), ()) + self.assertEqual(re.compile(b"bla").match(a), None) + self.assertEqual(re.compile(b"").match(a).groups(), ()) def test_inline_flags(self): # Bug #1700 @@ -658,6 +641,48 @@ class ReTests(unittest.TestCase): self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + def test_bytes_str_mixing(self): + # Mixing str and bytes is disallowed + pat = re.compile('.') + bpat = re.compile(b'.') + self.assertRaises(TypeError, pat.match, b'b') + self.assertRaises(TypeError, bpat.match, 'b') + self.assertRaises(TypeError, pat.sub, b'b', 'c') + self.assertRaises(TypeError, pat.sub, 'b', b'c') + self.assertRaises(TypeError, pat.sub, b'b', b'c') + self.assertRaises(TypeError, bpat.sub, b'b', 'c') + self.assertRaises(TypeError, bpat.sub, 'b', b'c') + self.assertRaises(TypeError, bpat.sub, 'b', 'c') + + def test_ascii_and_unicode_flag(self): + # String patterns + for flags in (0, re.UNICODE): + pat = re.compile('\xc0', flags | 
re.IGNORECASE) + self.assertNotEqual(pat.match('\xe0'), None) + pat = re.compile('\w', flags) + self.assertNotEqual(pat.match('\xe0'), None) + pat = re.compile('\xc0', re.ASCII | re.IGNORECASE) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('(?a)\xc0', re.IGNORECASE) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('\w', re.ASCII) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('(?a)\w') + self.assertEqual(pat.match('\xe0'), None) + # Bytes patterns + for flags in (0, re.ASCII): + pat = re.compile(b'\xc0', re.IGNORECASE) + self.assertEqual(pat.match(b'\xe0'), None) + pat = re.compile(b'\w') + self.assertEqual(pat.match(b'\xe0'), None) + # Incompatibilities + self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE) + self.assertRaises(ValueError, re.compile, b'(?u)\w') + self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII) + self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII) + self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE) + self.assertRaises(ValueError, re.compile, '(?au)\w') + def run_re_tests(): from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 31366de..ec5a79a 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -47,21 +47,23 @@ def group(*choices): return '(' + '|'.join(choices) + ')' def any(*choices): return group(*choices) + '*' def maybe(*choices): return group(*choices) + '?' +# Note: we use unicode matching for names ("\w") but ascii matching for +# number literals. 
Whitespace = r'[ \f\t]*' Comment = r'#[^\r\n]*' Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) Name = r'[a-zA-Z_]\w*' -Hexnumber = r'0[xX][\da-fA-F]+' +Hexnumber = r'0[xX][0-9a-fA-F]+' Binnumber = r'0[bB][01]+' Octnumber = r'0[oO][0-7]+' -Decnumber = r'(?:0+|[1-9]\d*)' +Decnumber = r'(?:0+|[1-9][0-9]*)' Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) -Exponent = r'[eE][-+]?\d+' -Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) -Expfloat = r'\d+' + Exponent +Exponent = r'[eE][-+]?[0-9]+' +Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent) +Expfloat = r'[0-9]+' + Exponent Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') +Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]') Number = group(Imagnumber, Floatnumber, Intnumber) # Tail end of ' string. diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 889d9642..aaab059 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -141,7 +141,7 @@ def urlcleanup(): _opener = None # copied from cookielib.py -_cut_port_re = re.compile(r":\d+$") +_cut_port_re = re.compile(r":\d+$", re.ASCII) def request_host(request): """Return request-host, as defined by RFC 2965. @@ -30,6 +30,14 @@ Core and Builtins Library ------- +- Issue #2834: update the regular expression library to match the unicode + standards of py3k. In other words, mixing bytes and unicode strings + (be it as pattern, search string or replacement string) raises a TypeError. + Moreover, the re.UNICODE flag is enabled automatically for unicode patterns, + and can be disabled by specifying a new re.ASCII flag; as for bytes + patterns, ASCII matching is the only option and trying to specify re.UNICODE + for such patterns raises a ValueError. + - Issue #3300: make urllib.parse.[un]quote() default to UTF-8. Code contributed by Matt Giuca. quote() now encodes the input before quoting, unquote() decodes after unquoting. 
There are diff --git a/Modules/_sre.c b/Modules/_sre.c index 64fc513..2a54d8e 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1691,7 +1691,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) /* get pointer to string buffer */ view.len = -1; buffer = Py_TYPE(string)->tp_as_buffer; - if (!buffer || !buffer->bf_getbuffer || + if (!buffer || !buffer->bf_getbuffer || (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "expected string or buffer"); return NULL; @@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) if (PyBytes_Check(string) || bytes == size) charsize = 1; #if defined(HAVE_UNICODE) - else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) + else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) charsize = sizeof(Py_UNICODE); #endif else { @@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) *p_charsize = charsize; if (ptr == NULL) { - PyErr_SetString(PyExc_ValueError, + PyErr_SetString(PyExc_ValueError, "Buffer is NULL"); } return ptr; @@ -1754,6 +1754,17 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, if (!ptr) return NULL; + if (charsize == 1 && pattern->charsize > 1) { + PyErr_SetString(PyExc_TypeError, + "can't use a string pattern on a bytes-like object"); + return NULL; + } + if (charsize > 1 && pattern->charsize == 1) { + PyErr_SetString(PyExc_TypeError, + "can't use a bytes pattern on a string-like object"); + return NULL; + } + /* adjust boundaries */ if (start < 0) start = 0; @@ -2682,6 +2693,16 @@ _compile(PyObject* self_, PyObject* args) return NULL; } + if (pattern == Py_None) + self->charsize = -1; + else { + Py_ssize_t p_length; + if (!getstring(pattern, &p_length, &self->charsize)) { + PyObject_DEL(self); + return NULL; + } + } + Py_INCREF(pattern); self->pattern = pattern; diff --git a/Modules/sre.h b/Modules/sre.h index d4af05c..518c11d 100644 --- a/Modules/sre.h +++ 
b/Modules/sre.h @@ -30,6 +30,7 @@ typedef struct { PyObject* pattern; /* pattern source (or None) */ int flags; /* flags used when compiling pattern source */ PyObject *weakreflist; /* List of weak references */ + int charsize; /* pattern charsize (or -1) */ /* pattern code */ Py_ssize_t codesize; SRE_CODE code[1]; |