summary refs log tree commit diff stats
path: root/Lib/re.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/re.py')
-rw-r--r-- Lib/re.py 145
1 files changed, 82 insertions, 63 deletions
diff --git a/Lib/re.py b/Lib/re.py
index aab5d37..85c5a57 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -44,7 +44,7 @@ The special characters are:
"|" A|B, creates an RE that will match either A or B.
(...) Matches the RE inside the parentheses.
The contents can be retrieved or matched later in the string.
- (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
+ (?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below).
(?:...) Non-grouping version of regular parentheses.
(?P<name>...) The substring matched by the group is accessible by name.
(?P=name) Matches the text matched earlier by the group named name.
@@ -64,11 +64,21 @@ resulting RE will match the second character.
\Z Matches only at the end of the string.
\b Matches the empty string, but only at the start or end of a word.
\B Matches the empty string, but not at the start or end of a word.
- \d Matches any decimal digit; equivalent to the set [0-9].
- \D Matches any non-digit character; equivalent to the set [^0-9].
- \s Matches any whitespace character; equivalent to [ \t\n\r\f\v].
- \S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
- \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
+ \d Matches any decimal digit; equivalent to the set [0-9] in
+ bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the whole
+ range of Unicode digits.
+ \D Matches any non-digit character; equivalent to [^\d].
+ \s Matches any whitespace character; equivalent to [ \t\n\r\f\v] in
+ bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the whole
+ range of Unicode whitespace characters.
+ \S Matches any non-whitespace character; equivalent to [^\s].
+ \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
+ in bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the
+ range of Unicode alphanumeric characters (letters plus digits
+ plus underscore).
With LOCALE, it will match the set [0-9_] plus characters defined
as letters for the current locale.
\W Matches the complement of \w.
@@ -87,6 +97,12 @@ This module exports the following functions:
escape Backslash all non-alphanumerics in a string.
Some of the functions in this module takes flags as optional parameters:
+ A ASCII For string patterns, make \w, \W, \b, \B, \d, \D
+ match the corresponding ASCII character categories
+ (rather than the whole Unicode categories, which is the
+ default).
+ For bytes patterns, this flag is the only available
+ behaviour and needn't be specified.
I IGNORECASE Perform case-insensitive matching.
L LOCALE Make \w, \W, \b, \B, dependent on the current locale.
M MULTILINE "^" matches the beginning of lines (after a newline)
@@ -95,7 +111,8 @@ Some of the functions in this module takes flags as optional parameters:
as the end of the string.
S DOTALL "." matches any character at all, including the newline.
X VERBOSE Ignore whitespace and comments for nicer looking RE's.
- U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale.
+ U UNICODE For compatibility only. Ignored for string patterns (it
+ is the default), and forbidden for bytes patterns.
This module also defines an exception 'error'.
@@ -104,19 +121,21 @@ This module also defines an exception 'error'.
import sys
import sre_compile
import sre_parse
+import functools
# public symbols
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
- "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
- "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
+ "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
+ "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
"UNICODE", "error" ]
__version__ = "2.2.1"
# flags
+A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
-U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
+U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
@@ -163,14 +182,19 @@ def subn(pattern, repl, string, count=0, flags=0):
def split(pattern, string, maxsplit=0, flags=0):
"""Split the source string by the occurrences of the pattern,
- returning a list containing the resulting substrings."""
+ returning a list containing the resulting substrings. If
+ capturing parentheses are used in pattern, then the text of all
+ groups in the pattern are also returned as part of the resulting
+ list. If maxsplit is nonzero, at most maxsplit splits occur,
+ and the remainder of the string is returned as the final element
+ of the list."""
return _compile(pattern, flags).split(string, maxsplit)
def findall(pattern, string, flags=0):
"""Return a list of all non-overlapping matches in the string.
- If one or more groups are present in the pattern, return a
- list of groups; this will be a list of tuples if the pattern
+ If one or more capturing groups are present in the pattern, return
+ a list of groups; this will be a list of tuples if the pattern
has more than one group.
Empty matches are included in the result."""
@@ -190,75 +214,70 @@ def compile(pattern, flags=0):
return _compile(pattern, flags)
def purge():
- "Clear the regular expression cache"
- _cache.clear()
- _cache_repl.clear()
+ "Clear the regular expression caches"
+ _compile_typed.cache_clear()
+ _compile_repl.cache_clear()
def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object"
return _compile(pattern, flags|T)
-_alphanum = frozenset(
- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
+_alphanum_str = frozenset(
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
+_alphanum_bytes = frozenset(
+ b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
def escape(pattern):
"Escape all non-alphanumeric characters in pattern."
- s = list(pattern)
- alphanum = _alphanum
- for i, c in enumerate(pattern):
- if c not in alphanum:
- if c == "\000":
- s[i] = "\\000"
+ if isinstance(pattern, str):
+ alphanum = _alphanum_str
+ s = list(pattern)
+ for i, c in enumerate(pattern):
+ if c not in alphanum:
+ if c == "\000":
+ s[i] = "\\000"
+ else:
+ s[i] = "\\" + c
+ return "".join(s)
+ else:
+ alphanum = _alphanum_bytes
+ s = []
+ esc = ord(b"\\")
+ for c in pattern:
+ if c in alphanum:
+ s.append(c)
else:
- s[i] = "\\" + c
- return pattern[:0].join(s)
+ if c == 0:
+ s.extend(b"\\000")
+ else:
+ s.append(esc)
+ s.append(c)
+ return bytes(s)
# --------------------------------------------------------------------
# internals
-_cache = {}
-_cache_repl = {}
-
_pattern_type = type(sre_compile.compile("", 0))
-_MAXCACHE = 100
+def _compile(pattern, flags):
+ return _compile_typed(type(pattern), pattern, flags)
-def _compile(*key):
+@functools.lru_cache(maxsize=500)
+def _compile_typed(text_bytes_type, pattern, flags):
# internal: compile pattern
- cachekey = (type(key[0]),) + key
- p = _cache.get(cachekey)
- if p is not None:
- return p
- pattern, flags = key
if isinstance(pattern, _pattern_type):
if flags:
- raise ValueError('Cannot process flags argument with a compiled pattern')
+ raise ValueError(
+ "Cannot process flags argument with a compiled pattern")
return pattern
if not sre_compile.isstring(pattern):
- raise TypeError, "first argument must be string or compiled pattern"
- try:
- p = sre_compile.compile(pattern, flags)
- except error, v:
- raise error, v # invalid expression
- if len(_cache) >= _MAXCACHE:
- _cache.clear()
- _cache[cachekey] = p
- return p
-
-def _compile_repl(*key):
+ raise TypeError("first argument must be string or compiled pattern")
+ return sre_compile.compile(pattern, flags)
+
+@functools.lru_cache(maxsize=500)
+def _compile_repl(repl, pattern):
# internal: compile replacement pattern
- p = _cache_repl.get(key)
- if p is not None:
- return p
- repl, pattern = key
- try:
- p = sre_parse.parse_template(repl, pattern)
- except error, v:
- raise error, v # invalid expression
- if len(_cache_repl) >= _MAXCACHE:
- _cache_repl.clear()
- _cache_repl[key] = p
- return p
+ return sre_parse.parse_template(repl, pattern)
def _expand(pattern, match, template):
# internal: match.expand implementation hook
@@ -277,12 +296,12 @@ def _subx(pattern, template):
# register myself for pickling
-import copy_reg
+import copyreg
def _pickle(p):
return _compile, (p.pattern, p.flags)
-copy_reg.pickle(_pattern_type, _pickle, _compile)
+copyreg.pickle(_pattern_type, _pickle, _compile)
# --------------------------------------------------------------------
# experimental stuff (see python-dev discussions for details)
@@ -315,7 +334,7 @@ class Scanner:
if i == j:
break
action = self.lexicon[m.lastindex-1][1]
- if hasattr(action, '__call__'):
+ if callable(action):
self.match = m
action = action(self, m.group())
if action is not None: