diff options
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 412 |
1 files changed, 213 insertions, 199 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index fad8014..aa2d64b 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -13,17 +13,19 @@ # XXX: show string offset and offending character for all errors from sre_constants import * -from _sre import MAXREPEAT SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" -DIGITS = set("0123456789") +DIGITS = frozenset("0123456789") -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") -WHITESPACE = set(" \t\n\r\v\f") +WHITESPACE = frozenset(" \t\n\r\v\f") + +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) +_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) ESCAPES = { r"\a": (LITERAL, ord("\a")), @@ -74,11 +76,13 @@ class Pattern: def opengroup(self, name=None): gid = self.groups self.subpatterns.append(None) + if self.groups > MAXGROUPS: + raise error("groups number is too large") if name is not None: ogid = self.groupdict.get(name, None) if ogid is not None: - raise error("redefinition of group name %s as group %d; " - "was group %d" % (repr(name), gid, ogid)) + raise error("redefinition of group name %r as group %d; " + "was group %d" % (name, gid, ogid)) self.groupdict[name] = gid return gid def closegroup(self, gid, p): @@ -98,24 +102,24 @@ class SubPattern: nl = True seqtypes = (tuple, list) for op, av in self.data: - print(level*" " + op, end='') + print(level*" " + str(op), end='') if op == IN: # member sublanguage print() for op, a in av: - print((level+1)*" " + op, a) + print((level+1)*" " + str(op), a) elif op == BRANCH: print() for i, a in enumerate(av[1]): if i: - print(level*" " + "or") + print(level*" " + "OR") a.dump(level+1) elif op == GROUPREF_EXISTS: condgroup, item_yes, item_no = av print('', condgroup) item_yes.dump(level+1) if item_no: - print(level*" " + "else") + print(level*" " + "ELSE") item_no.dump(level+1) elif isinstance(av, seqtypes): nl = False @@ -152,11 +156,9 @@ class SubPattern: self.data.append(code) def getwidth(self): # determine the width (min, max) for this subpattern - if self.width: + if self.width is not None: return self.width lo = hi = 0 - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) for op, av in self.data: if op is BRANCH: i = MAXREPEAT - 1 @@ -175,11 +177,11 @@ class SubPattern: i, j = av[1].getwidth() lo = lo + i hi = hi + j - elif op in REPEATCODES: + elif op in _REPEATCODES: i, j = av[2].getwidth() lo = lo + i * av[0] hi = hi + j * av[1] - elif op in UNITCODES: + elif op in _UNITCODES: lo = lo + 1 hi = hi + 1 elif op is GROUPREF: @@ -205,33 +207,33 @@ class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str) self.string = string + if not self.istext: + string = str(string, 'latin1') + self.decoded_string = string self.index = 0 + self.next = None self.__next() def __next(self): - if self.index >= len(self.string): + index = self.index + try: + char = self.decoded_string[index] + except IndexError: self.next = None return - char = self.string[self.index:self.index+1] - # Special case for the str8, since indexing returns a integer - # XXX This is only needed for test_bug_926075 in test_re.py - if char and not self.istext: - char = chr(char[0]) if char == "\\": + index += 1 try: - c = self.string[self.index + 1] + char += self.decoded_string[index] except IndexError: - raise error("bogus escape (end of line)") - if not self.istext: - c = chr(c) - char = char + c - self.index = self.index + len(char) + raise error("bogus escape (end of line)", + self.string, len(self.string) - 1) from None + self.index = index + 1 self.next = char - def match(self, char, skip=1): + def match(self, char): if char == self.next: - if skip: - self.__next() - return 1 - return 0 + self.__next() + return True + return False def get(self): this = self.next self.__next() @@ -245,10 +247,25 @@ class Tokenizer: result += c self.__next() return result + def getuntil(self, terminator): + result = '' + while True: + c = self.next + self.__next() + if c is None: + raise self.error("unterminated name") + if c == terminator: + break + result += c + return result def tell(self): - return self.index, self.next + return self.index - len(self.next or '') def seek(self, index): - self.index, self.next = index + self.index = index + self.__next() + + def error(self, msg, offset=0): + return error(msg, self.string, self.tell() - offset) # The following three functions are not used in this module anymore, but we keep # them here (with DeprecationWarnings) for backwards compatibility. @@ -283,7 +300,7 @@ def _class_escape(source, escape): if code: return code code = CATEGORIES.get(escape) - if code and code[0] == IN: + if code and code[0] is IN: return code try: c = escape[1:2] @@ -292,7 +309,7 @@ def _class_escape(source, escape): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -310,14 +327,18 @@ def _class_escape(source, escape): elif c in OCTDIGITS: # octal escape (up to three digits) escape += source.getwhile(2, OCTDIGITS) - return LITERAL, int(escape[1:], 8) & 0xff + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %r outside of ' + 'range 0-0o377' % escape, len(escape)) + return LITERAL, c elif c in DIGITS: raise ValueError if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: pass - raise error("bogus escape: %s" % repr(escape)) + raise source.error("bogus escape: %r" % escape, len(escape)) def _escape(source, escape, state): # handle escape code in expression @@ -334,7 +355,7 @@ def _escape(source, escape, state): escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -352,45 +373,47 @@ def _escape(source, escape, state): elif c == "0": # octal escape escape += source.getwhile(2, OCTDIGITS) - return LITERAL, int(escape[1:], 8) & 0xff + return LITERAL, int(escape[1:], 8) elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: - escape = escape + source.get() + escape += source.get() if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and source.next in OCTDIGITS): # got three octal digits; this is an octal escape - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff + escape += source.get() + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %r outside of ' + 'range 0-0o377' % escape, + len(escape)) + return LITERAL, c # not an octal escape, so this is a group reference group = int(escape[1:]) if group < state.groups: if not state.checkgroup(group): - raise error("cannot refer to open group") + raise source.error("cannot refer to open group", + len(escape)) return GROUPREF, group raise ValueError if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: pass - raise error("bogus escape: %s" % repr(escape)) + raise source.error("bogus escape: %r" % escape, len(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match - while 1: + while True: itemsappend(_parse(source, state)) - if sourcematch("|"): - continue - if not nested: + if not sourcematch("|"): break - if not source.next or sourcematch(")", 0): - break - else: - raise error("pattern not properly closed") + if nested and source.next is not None and source.next != ")": + raise source.error("pattern not properly closed") if len(items) == 1: return items[0] @@ -399,7 +422,7 @@ def _parse_sub(source, state, nested=1): subpatternappend = subpattern.append # check if all items share a common prefix - while 1: + while True: prefix = None for item in items: if not item: @@ -419,16 +442,12 @@ def _parse_sub(source, state, nested=1): # check if the branch can be replaced by a character set for item in items: - if len(item) != 1 or item[0][0] != LITERAL: + if len(item) != 1 or item[0][0] is not LITERAL: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) + subpatternappend((IN, [item[0] for item in items])) return subpattern subpattern.append((BRANCH, (None, items))) @@ -438,21 +457,16 @@ def _parse_sub_cond(source, state, condgroup): item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state) - if source.match("|"): - raise error("conditional backref with more than two branches") + if source.next == "|": + raise source.error("conditional backref with more than two branches") else: item_no = None - if source.next and not source.match(")", 0): - raise error("pattern not properly closed") + if source.next is not None and source.next != ")": + raise source.error("pattern not properly closed") subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) - def _parse(source, state): # parse a simple pattern subpattern = SubPattern(state) @@ -462,32 +476,35 @@ def _parse(source, state): sourceget = source.get sourcematch = source.match _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES + _ord = ord + verbose = state.flags & SRE_FLAG_VERBOSE - while 1: + while True: - if source.next in PATTERNENDERS: - break # end of subpattern - this = sourceget() + this = source.next if this is None: break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() - if state.flags & SRE_FLAG_VERBOSE: + if verbose: # skip whitespace and comments if this in WHITESPACE: continue if this == "#": - while 1: + while True: this = sourceget() - if this in (None, "\n"): + if this is None or this == "\n": break continue - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) elif this == "[": # character set @@ -499,39 +516,38 @@ def _parse(source, state): setappend((NEGATE, None)) # check remaining characters start = set[:] - while 1: + while True: this = sourceget() + if this is None: + raise source.error("unexpected end of regular expression") if this == "]" and set != start: break - elif this and this[0] == "\\": + elif this[0] == "\\": code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, ord(this) else: - raise error("unexpected end of regular expression") + code1 = LITERAL, _ord(this) if sourcematch("-"): # potential range this = sourceget() + if this is None: + raise source.error("unexpected end of regular expression") if this == "]": if code1[0] is IN: code1 = code1[1][0] setappend(code1) - setappend((LITERAL, ord("-"))) + setappend((LITERAL, _ord("-"))) break - elif this: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error("bad character range") - lo = code1[1] - hi = code2[1] - if hi < lo: - raise error("bad character range") - setappend((RANGE, (lo, hi))) + if this[0] == "\\": + code2 = _class_escape(source, this) else: - raise error("unexpected end of regular expression") + code2 = LITERAL, _ord(this) + if code1[0] != LITERAL or code2[0] != LITERAL: + raise source.error("bad character range", len(this)) + lo = code1[1] + hi = code2[1] + if hi < lo: + raise source.error("bad character range", len(this)) + setappend((RANGE, (lo, hi))) else: if code1[0] is IN: code1 = code1[1][0] @@ -546,8 +562,9 @@ def _parse(source, state): # XXX: <fl> should add charmap optimization here subpatternappend((IN, set)) - elif this and this[0] in REPEAT_CHARS: + elif this in REPEAT_CHARS: # repeat previous item + here = source.tell() if this == "?": min, max = 0, 1 elif this == "*": @@ -557,20 +574,19 @@ def _parse(source, state): min, max = 1, MAXREPEAT elif this == "{": if source.next == "}": - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) continue - here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: - lo = lo + source.get() + lo += sourceget() if sourcematch(","): while source.next in DIGITS: - hi = hi + sourceget() + hi += sourceget() else: hi = lo if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) source.seek(here) continue if lo: @@ -582,18 +598,21 @@ def _parse(source, state): if max >= MAXREPEAT: raise OverflowError("the repetition number is too large") if max < min: - raise error("bad repeat interval") + raise source.error("bad repeat interval", + source.tell() - here) else: - raise error("not supported") + raise source.error("not supported", len(this)) # figure out which item to repeat if subpattern: item = subpattern[-1:] else: item = None if not item or (_len(item) == 1 and item[0][0] == AT): - raise error("nothing to repeat") - if item[0][0] in REPEATCODES: - raise error("multiple repeat") + raise source.error("nothing to repeat", + source.tell() - here + len(this)) + if item[0][0] in _REPEATCODES: + raise source.error("multiple repeat", + source.tell() - here + len(this)) if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) else: @@ -609,131 +628,129 @@ def _parse(source, state): if sourcematch("?"): group = 0 # options - if sourcematch("P"): + char = sourceget() + if char is None: + raise self.error("unexpected end of pattern") + if char == "P": # python extensions if sourcematch("<"): # named group: skip forward to end of name - name = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ">": - break - name = name + char + name = source.getuntil(">") group = 1 if not name: - raise error("missing group name") + raise source.error("missing group name", 1) if not name.isidentifier(): - raise error("bad character in group name %r" % name) + raise source.error("bad character in group name " + "%r" % name, + len(name) + 1) elif sourcematch("="): # named backreference - name = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ")": - break - name = name + char + name = source.getuntil(")") if not name: - raise error("missing group name") + raise source.error("missing group name", 1) if not name.isidentifier(): - raise error("bad character in backref group name " - "%r" % name) + raise source.error("bad character in backref " + "group name %r" % name, + len(name) + 1) gid = state.groupdict.get(name) if gid is None: msg = "unknown group name: {0!r}".format(name) - raise error(msg) + raise source.error(msg, len(name) + 1) subpatternappend((GROUPREF, gid)) continue else: char = sourceget() if char is None: - raise error("unexpected end of pattern") - raise error("unknown specifier: ?P%s" % char) - elif sourcematch(":"): + raise source.error("unexpected end of pattern") + raise source.error("unknown specifier: ?P%s" % char, + len(char)) + elif char == ":": # non-capturing group group = 2 - elif sourcematch("#"): + elif char == "#": # comment - while 1: - if source.next is None or source.next == ")": + while True: + if source.next is None: + raise source.error("unbalanced parenthesis") + if sourceget() == ")": break - sourceget() - if not sourcematch(")"): - raise error("unbalanced parenthesis") continue - elif source.next in ASSERTCHARS: + elif char in "=!<": # lookahead assertions - char = sourceget() dir = 1 if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: - raise error("syntax error") - dir = -1 # lookbehind char = sourceget() + if char is None or char not in "=!": + raise source.error("syntax error") + dir = -1 # lookbehind p = _parse_sub(source, state) if not sourcematch(")"): - raise error("unbalanced parenthesis") + raise source.error("unbalanced parenthesis") if char == "=": subpatternappend((ASSERT, (dir, p))) else: subpatternappend((ASSERT_NOT, (dir, p))) continue - elif sourcematch("("): + elif char == "(": # conditional backreference group - condname = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ")": - break - condname = condname + char + condname = source.getuntil(")") group = 2 if not condname: - raise error("missing group name") + raise source.error("missing group name", 1) if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: msg = "unknown group name: {0!r}".format(condname) - raise error(msg) + raise source.error(msg, len(condname) + 1) else: try: condgroup = int(condname) + if condgroup < 0: + raise ValueError except ValueError: - raise error("bad character in group name") - else: + raise source.error("bad character in group name", + len(condname) + 1) + if not condgroup: + raise source.error("bad group number", + len(condname) + 1) + if condgroup >= MAXGROUPS: + raise source.error("the group number is too large", + len(condname) + 1) + elif char in FLAGS: # flags - if not source.next in FLAGS: - raise error("unexpected end of pattern") + state.flags |= FLAGS[char] while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] + state.flags |= FLAGS[sourceget()] + verbose = state.flags & SRE_FLAG_VERBOSE + else: + raise source.error("unexpected end of pattern") if group: # parse group contents if group == 2: # anonymous group group = None else: - group = state.opengroup(name) + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, len(name) + 1) if condgroup: p = _parse_sub_cond(source, state, condgroup) else: p = _parse_sub(source, state) if not sourcematch(")"): - raise error("unbalanced parenthesis") + raise source.error("unbalanced parenthesis") if group is not None: state.closegroup(group, p) subpatternappend((SUBPATTERN, (group, p))) else: - while 1: + while True: char = sourceget() if char is None: - raise error("unexpected end of pattern") + raise source.error("unexpected end of pattern") if char == ")": break - raise error("unknown extension") + raise source.error("unknown extension", len(char)) elif this == "^": subpatternappend((AT, AT_BEGINNING)) @@ -741,12 +758,8 @@ def _parse(source, state): elif this == "$": subpattern.append((AT, AT_END)) - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - else: - raise error("parser error") + raise source.error("parser error", len(this)) return subpattern @@ -775,11 +788,12 @@ def parse(str, flags=0, pattern=None): p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags) - tail = source.get() - if tail == ")": - raise error("unbalanced parenthesis") - elif tail: - raise error("bogus characters at end of regular expression") + if source.next is not None: + if source.next == ")": + raise source.error("unbalanced parenthesis") + else: + raise source.error("bogus characters at end of regular expression", + len(tail)) if flags & SRE_FLAG_DEBUG: p.dump() @@ -816,22 +830,20 @@ def parse_template(source, pattern): if c == "g": name = "" if s.match("<"): - while True: - char = sget() - if char is None: - raise error("unterminated group name") - if char == ">": - break - name += char + name = s.getuntil(">") if not name: - raise error("missing group name") + raise s.error("missing group name", 1) try: index = int(name) if index < 0: - raise error("negative group number") + raise s.error("negative group number", len(name) + 1) + if index >= MAXGROUPS: + raise s.error("the group number is too large", + len(name) + 1) except ValueError: if not name.isidentifier(): - raise error("bad character in group name") + raise s.error("bad character in group name", + len(name) + 1) try: index = pattern.groupindex[name] except KeyError: @@ -852,7 +864,11 @@ def parse_template(source, pattern): s.next in OCTDIGITS): this += sget() isoctal = True - lappend(chr(int(this[1:], 8) & 0xff)) + c = int(this[1:], 8) + if c > 0o377: + raise s.error('octal escape value %r outside of ' + 'range 0-0o377' % this, len(this)) + lappend(chr(c)) if not isoctal: addgroup(int(this[1:])) else: @@ -873,14 +889,12 @@ def parse_template(source, pattern): def expand_template(template, match): g = match.group - sep = match.string[:0] + empty = match.string[:0] groups, literals = template literals = literals[:] try: for index, group in groups: - literals[index] = s = g(group) - if s is None: - raise error("unmatched group") + literals[index] = g(group) or empty except IndexError: raise error("invalid group reference") - return sep.join(literals) + return empty.join(literals) |