diff options
author | Guido van Rossum <guido@python.org> | 2000-06-30 16:13:37 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2000-06-30 16:13:37 (GMT) |
commit | 4358b2c928e756a77613da7cb8c57676e8872935 (patch) | |
tree | a2b17260f4109d2deec6381b20da0a864a82c0cc | |
parent | 257543c78d099f45d94e7f557214648768bd7460 (diff) | |
download | cpython-4358b2c928e756a77613da7cb8c57676e8872935.zip cpython-4358b2c928e756a77613da7cb8c57676e8872935.tar.gz cpython-4358b2c928e756a77613da7cb8c57676e8872935.tar.bz2 |
the usual
-rw-r--r-- | Lib/dos-8x3/sre_comp.py | 329 | ||||
-rw-r--r-- | Lib/dos-8x3/sre_cons.py | 32 | ||||
-rw-r--r-- | Lib/dos-8x3/sre_pars.py | 898 | ||||
-rw-r--r-- | Lib/dos-8x3/test_has.py | 26 |
4 files changed, 713 insertions, 572 deletions
diff --git a/Lib/dos-8x3/sre_comp.py b/Lib/dos-8x3/sre_comp.py index c042375..e48a7eb 100644 --- a/Lib/dos-8x3/sre_comp.py +++ b/Lib/dos-8x3/sre_comp.py @@ -18,157 +18,212 @@ from sre_constants import * # find an array type code that matches the engine's code size for WORDSIZE in "BHil": if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): - break + break else: raise RuntimeError, "cannot find a useable array type" def _compile(code, pattern, flags): + # internal: compile a (sub)pattern emit = code.append for op, av in pattern: - if op is ANY: - if flags & SRE_FLAG_DOTALL: - emit(OPCODES[op]) - else: - emit(OPCODES[CATEGORY]) - emit(CHCODES[CATEGORY_NOT_LINEBREAK]) - elif op in (SUCCESS, FAILURE): - emit(OPCODES[op]) - elif op is AT: - emit(OPCODES[op]) - if flags & SRE_FLAG_MULTILINE: - emit(ATCODES[AT_MULTILINE[av]]) - else: - emit(ATCODES[av]) - elif op is BRANCH: - emit(OPCODES[op]) - tail = [] - for av in av[1]: - skip = len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[JUMP]) - tail.append(len(code)); emit(0) - code[skip] = len(code) - skip - emit(0) # end of branch - for tail in tail: - code[tail] = len(code) - tail - elif op is CALL: - emit(OPCODES[op]) - skip = len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif op is CATEGORY: - emit(OPCODES[op]) - if flags & SRE_FLAG_LOCALE: - emit(CH_LOCALE[CHCODES[av]]) - elif flags & SRE_FLAG_UNICODE: - emit(CH_UNICODE[CHCODES[av]]) - else: - emit(CHCODES[av]) - elif op is GROUP: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - else: - emit(OPCODES[op]) - emit(av-1) - elif op is IN: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - def fixup(literal, flags=flags): - return _sre.getlower(ord(literal), flags) - else: - emit(OPCODES[op]) - fixup = ord - skip = len(code); emit(0) - for op, av in av: - emit(OPCODES[op]) - if op is NEGATE: - pass - elif op is LITERAL: - emit(fixup(av)) - elif op is RANGE: - emit(fixup(av[0])) - emit(fixup(av[1])) - elif op is CATEGORY: - if flags & SRE_FLAG_LOCALE: - emit(CH_LOCALE[CHCODES[av]]) - elif flags & SRE_FLAG_UNICODE: - emit(CH_UNICODE[CHCODES[av]]) - else: - emit(CHCODES[av]) - else: - raise error, "internal: unsupported set operator" - emit(OPCODES[FAILURE]) - code[skip] = len(code) - skip - elif op in (LITERAL, NOT_LITERAL): - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - else: - emit(OPCODES[op]) - emit(ord(av)) - elif op is MARK: - emit(OPCODES[op]) - emit(av) - elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): - if flags & SRE_FLAG_TEMPLATE: - emit(OPCODES[REPEAT]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - else: - lo, hi = av[2].getwidth() - if lo == 0: - raise error, "nothing to repeat" - if 0 and lo == hi == 1 and op is MAX_REPEAT: - # FIXME: <fl> need a better way to figure out when - # it's safe to use this one (in the parser, probably) - emit(OPCODES[MAX_REPEAT_ONE]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - else: - emit(OPCODES[op]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif op is SUBPATTERN: - group = av[0] - if group: - emit(OPCODES[MARK]) - emit((group-1)*2) - _compile(code, av[1], flags) - if group: - emit(OPCODES[MARK]) - emit((group-1)*2+1) - else: - raise ValueError, ("unsupported operand type", op) + if op in (LITERAL, NOT_LITERAL): + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + else: + emit(OPCODES[op]) + emit(av) + elif op is IN: + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + def fixup(literal, flags=flags): + return _sre.getlower(literal, flags) + else: + emit(OPCODES[op]) + fixup = lambda x: x + skip = len(code); emit(0) + for op, av in av: + emit(OPCODES[op]) + if op is NEGATE: + pass + elif op is LITERAL: + emit(fixup(av)) + elif op is RANGE: + emit(fixup(av[0])) + emit(fixup(av[1])) + elif op is CATEGORY: + if flags & SRE_FLAG_LOCALE: + emit(CHCODES[CH_LOCALE[av]]) + elif flags & SRE_FLAG_UNICODE: + emit(CHCODES[CH_UNICODE[av]]) + else: + emit(CHCODES[av]) + else: + raise error, "internal: unsupported set operator" + emit(OPCODES[FAILURE]) + code[skip] = len(code) - skip + elif op is ANY: + if flags & SRE_FLAG_DOTALL: + emit(OPCODES[op]) + else: + emit(OPCODES[CATEGORY]) + emit(CHCODES[CATEGORY_NOT_LINEBREAK]) + elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): + if flags & SRE_FLAG_TEMPLATE: + emit(OPCODES[REPEAT]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + else: + lo, hi = av[2].getwidth() + if lo == 0: + raise error, "nothing to repeat" + if 0 and lo == hi == 1 and op is MAX_REPEAT: + # FIXME: <fl> need a better way to figure out when + # it's safe to use this one (in the parser, probably) + emit(OPCODES[MAX_REPEAT_ONE]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + else: + emit(OPCODES[op]) + skip = len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + elif op is SUBPATTERN: + group = av[0] + if group: + emit(OPCODES[MARK]) + emit((group-1)*2) + _compile(code, av[1], flags) + if group: + emit(OPCODES[MARK]) + emit((group-1)*2+1) + elif op in (SUCCESS, FAILURE): + emit(OPCODES[op]) + elif op in (ASSERT, ASSERT_NOT, CALL): + emit(OPCODES[op]) + skip = len(code); emit(0) + _compile(code, av, flags) + emit(OPCODES[SUCCESS]) + code[skip] = len(code) - skip + elif op is AT: + emit(OPCODES[op]) + if flags & SRE_FLAG_MULTILINE: + emit(ATCODES[AT_MULTILINE[av]]) + else: + emit(ATCODES[av]) + elif op is BRANCH: + emit(OPCODES[op]) + tail = [] + for av in av[1]: + skip = len(code); emit(0) + _compile(code, av, flags) + emit(OPCODES[JUMP]) + tail.append(len(code)); emit(0) + code[skip] = len(code) - skip + emit(0) # end of branch + for tail in tail: + code[tail] = len(code) - tail + elif op is CATEGORY: + emit(OPCODES[op]) + if flags & SRE_FLAG_LOCALE: + emit(CHCODES[CH_LOCALE[av]]) + elif flags & SRE_FLAG_UNICODE: + emit(CHCODES[CH_UNICODE[av]]) + else: + emit(CHCODES[av]) + elif op is GROUP: + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + else: + emit(OPCODES[op]) + emit(av-1) + elif op is MARK: + emit(OPCODES[op]) + emit(av) + else: + raise ValueError, ("unsupported operand type", op) + +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width and a literal prefix, + # if any + lo, hi = pattern.getwidth() + if lo == 0: + return # not worth it + # look for a literal prefix + prefix = [] + if not (flags & SRE_FLAG_IGNORECASE): + for op, av in pattern.data: + if op is LITERAL: + prefix.append(av) + else: + break + # add an info block + emit = code.append + emit(OPCODES[INFO]) + skip = len(code); emit(0) + # literal flag + mask = 0 + if len(prefix) == len(pattern.data): + mask = 1 + emit(mask) + # pattern length + emit(lo) + if hi < 32768: + emit(hi) + else: + emit(0) + # add literal prefix + emit(len(prefix)) + if prefix: + code.extend(prefix) + # generate overlap table + table = [-1] + ([0]*len(prefix)) + for i in range(len(prefix)): + table[i+1] = table[i]+1 + while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: + table[i+1] = table[table[i+1]-1]+1 + code.extend(table[1:]) # don't store first entry + code[skip] = len(code) - skip def compile(p, flags=0): # internal: convert pattern list to internal format + + # compile, as necessary if type(p) in (type(""), type(u"")): - import sre_parse - pattern = p - p = sre_parse.parse(p) + import sre_parse + pattern = p + p = sre_parse.parse(p) else: - pattern = None + pattern = None + flags = p.pattern.flags | flags code = [] + + # compile info block + _compile_info(code, p, flags) + + # compile the pattern _compile(code, p.data, flags) + code.append(OPCODES[SUCCESS]) - # FIXME: <fl> get rid of this limitation + + # FIXME: <fl> get rid of this limitation! assert p.pattern.groups <= 100,\ - "sorry, but this version only supports 100 named groups" + "sorry, but this version only supports 100 named groups" + return _sre.compile( - pattern, flags, - array.array(WORDSIZE, code).tostring(), - p.pattern.groups-1, p.pattern.groupdict - ) + pattern, flags, + array.array(WORDSIZE, code).tostring(), + p.pattern.groups-1, p.pattern.groupdict + ) diff --git a/Lib/dos-8x3/sre_cons.py b/Lib/dos-8x3/sre_cons.py index f5e7894..45f4f48 100644 --- a/Lib/dos-8x3/sre_cons.py +++ b/Lib/dos-8x3/sre_cons.py @@ -23,6 +23,7 @@ SUCCESS = "success" ANY = "any" ASSERT = "assert" +ASSERT_NOT = "assert_not" AT = "at" BRANCH = "branch" CALL = "call" @@ -81,7 +82,7 @@ OPCODES = [ FAILURE, SUCCESS, ANY, - ASSERT, + ASSERT, ASSERT_NOT, AT, BRANCH, CALL, @@ -121,8 +122,8 @@ def makedict(list): d = {} i = 0 for item in list: - d[item] = i - i = i + 1 + d[item] = i + i = i + 1 return d OPCODES = makedict(OPCODES) @@ -176,12 +177,27 @@ SRE_FLAG_VERBOSE = 64 if __name__ == "__main__": import string def dump(f, d, prefix): - items = d.items() - items.sort(lambda a, b: cmp(a[1], b[1])) - for k, v in items: - f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v)) + items = d.items() + items.sort(lambda a, b: cmp(a[1], b[1])) + for k, v in items: + f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v)) f = open("sre_constants.h", "w") - f.write("/* generated from sre_constants.py */\n") + f.write("""\ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +""") + dump(f, OPCODES, "SRE_OP") dump(f, ATCODES, "SRE") dump(f, CHCODES, "SRE") diff --git a/Lib/dos-8x3/sre_pars.py b/Lib/dos-8x3/sre_pars.py index 93a7b5d..fb954e9 100644 --- a/Lib/dos-8x3/sre_pars.py +++ b/Lib/dos-8x3/sre_pars.py @@ -19,6 +19,9 @@ from sre_constants import * # FIXME: should be 65535, but the arraymodule is still broken MAXREPEAT = 32767 +# FIXME: same here +CHARMASK = 0x7fff + SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -30,26 +33,27 @@ HEXDIGITS = tuple("0123456789abcdefABCDEF") WHITESPACE = string.whitespace ESCAPES = { - "\\a": (LITERAL, chr(7)), - "\\b": (LITERAL, chr(8)), - "\\f": (LITERAL, chr(12)), - "\\n": (LITERAL, chr(10)), - "\\r": (LITERAL, chr(13)), - "\\t": (LITERAL, chr(9)), - "\\v": (LITERAL, chr(11)) + r"\a": (LITERAL, 7), + r"\b": (LITERAL, 8), + r"\f": (LITERAL, 12), + r"\n": (LITERAL, 10), + r"\r": (LITERAL, 13), + r"\t": (LITERAL, 9), + r"\v": (LITERAL, 11), + r"\\": (LITERAL, ord("\\")) } CATEGORIES = { - "\\A": (AT, AT_BEGINNING), # start of string - "\\b": (AT, AT_BOUNDARY), - "\\B": (AT, AT_NON_BOUNDARY), - "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - "\\Z": (AT, AT_END), # end of string + r"\A": (AT, AT_BEGINNING), # start of string + r"\b": (AT, AT_BOUNDARY), + r"\B": (AT, AT_NON_BOUNDARY), + r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\Z": (AT, AT_END), # end of string } FLAGS = { @@ -66,106 +70,106 @@ FLAGS = { class State: def __init__(self): - self.flags = 0 - self.groups = 1 - self.groupdict = {} + self.flags = 0 + self.groups = 1 + self.groupdict = {} def getgroup(self, name=None): - gid = self.groups - self.groups = gid + 1 - if name: - self.groupdict[name] = gid - return gid + gid = self.groups + self.groups = gid + 1 + if name: + self.groupdict[name] = gid + return gid class SubPattern: # a subpattern, in intermediate form def __init__(self, pattern, data=None): - self.pattern = pattern - if not data: - data = [] - self.data = data - self.width = None + self.pattern = pattern + if not data: + data = [] + self.data = data + self.width = None def __repr__(self): - return repr(self.data) + return repr(self.data) def __len__(self): - return len(self.data) + return len(self.data) def __delitem__(self, index): - del self.data[index] + del self.data[index] def __getitem__(self, index): - return self.data[index] + return self.data[index] def __setitem__(self, index, code): - self.data[index] = code + self.data[index] = code def __getslice__(self, start, stop): - return SubPattern(self.pattern, self.data[start:stop]) + return SubPattern(self.pattern, self.data[start:stop]) def insert(self, index, code): - self.data.insert(index, code) + self.data.insert(index, code) def append(self, code): - self.data.append(code) + self.data.append(code) def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width: - return self.width - lo = hi = 0L - for op, av in self.data: - if op is BRANCH: - l = sys.maxint - h = 0 - for av in av[1]: - i, j = av.getwidth() - l = min(l, i) - h = min(h, j) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[1].getwidth() - lo = lo + i - hi = hi + j - elif op in (MIN_REPEAT, MAX_REPEAT): - i, j = av[2].getwidth() - lo = lo + long(i) * av[0] - hi = hi + long(j) * av[1] - elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): - lo = lo + 1 - hi = hi + 1 - elif op == SUCCESS: - break - self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) - return self.width + # determine the width (min, max) for this subpattern + if self.width: + return self.width + lo = hi = 0L + for op, av in self.data: + if op is BRANCH: + l = sys.maxint + h = 0 + for av in av[1]: + i, j = av.getwidth() + l = min(l, i) + h = min(h, j) + lo = lo + i + hi = hi + j + elif op is CALL: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[1].getwidth() + lo = lo + i + hi = hi + j + elif op in (MIN_REPEAT, MAX_REPEAT): + i, j = av[2].getwidth() + lo = lo + long(i) * av[0] + hi = hi + long(j) * av[1] + elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): + lo = lo + 1 + hi = hi + 1 + elif op == SUCCESS: + break + self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) + return self.width class Tokenizer: def __init__(self, string): - self.index = 0 - self.string = string - self.next = self.__next() + self.index = 0 + self.string = string + self.next = self.__next() def __next(self): - if self.index >= len(self.string): - return None - char = self.string[self.index] - if char[0] == "\\": - try: - c = self.string[self.index + 1] - except IndexError: - raise error, "bogus escape" - char = char + c - self.index = self.index + len(char) - return char + if self.index >= len(self.string): + return None + char = self.string[self.index] + if char[0] == "\\": + try: + c = self.string[self.index + 1] + except IndexError: + raise error, "bogus escape" + char = char + c + self.index = self.index + len(char) + return char def match(self, char): - if char == self.next: - self.next = self.__next() - return 1 - return 0 + if char == self.next: + self.next = self.__next() + return 1 + return 0 def match_set(self, set): - if self.next and self.next in set: - self.next = self.__next() - return 1 - return 0 + if self.next and self.next in set: + self.next = self.__next() + return 1 + return 0 def get(self): - this = self.next - self.next = self.__next() - return this + this = self.next + self.next = self.__next() + return this def isident(char): return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" @@ -175,127 +179,118 @@ def isdigit(char): def isname(name): # check that group name is a valid string - # FIXME: <fl> this code is really lame. should use a regular - # expression instead, but I seem to have certain bootstrapping - # problems here ;-) if not isident(name[0]): - return 0 + return 0 for char in name: - if not isident(char) and not isdigit(char): - return 0 + if not isident(char) and not isdigit(char): + return 0 return 1 -def _group(escape, state): +def _group(escape, groups): # check if the escape string represents a valid group try: - group = int(escape[1:]) - if group and group < state.groups: - return group + gid = int(escape[1:]) + if gid and gid < groups: + return gid except ValueError: - pass + pass return None # not a valid group def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) if code: - return code + return code code = CATEGORIES.get(escape) if code: - return code + return code try: - if escape[1:2] == "x": - while source.next in HEXDIGITS: - escape = escape + source.get() - escape = escape[2:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-4:], 16) & 0xff) - elif str(escape[1:2]) in OCTDIGITS: - while source.next in OCTDIGITS: - escape = escape + source.get() - escape = escape[1:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-6:], 8) & 0xff) - if len(escape) == 2: - return LITERAL, escape[1] + if escape[1:2] == "x": + while source.next in HEXDIGITS: + escape = escape + source.get() + escape = escape[2:] + return LITERAL, int(escape[-4:], 16) & CHARMASK + elif str(escape[1:2]) in OCTDIGITS: + while source.next in OCTDIGITS: + escape = escape + source.get() + escape = escape[1:] + return LITERAL, int(escape[-6:], 8) & CHARMASK + if len(escape) == 2: + return LITERAL, ord(escape[1]) except ValueError: - pass + pass raise error, "bogus escape: %s" % repr(escape) def _escape(source, escape, state): # handle escape code in expression code = CATEGORIES.get(escape) if code: - return code + return code code = ESCAPES.get(escape) if code: - return code + return code try: - if escape[1:2] == "x": - while source.next in HEXDIGITS: - escape = escape + source.get() - escape = escape[2:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-4:], 16) & 0xff) - elif escape[1:2] in DIGITS: - while 1: - group = _group(escape, state) - if group: - if (not source.next or - not _group(escape + source.next, state)): - return GROUP, group - escape = escape + source.get() - elif source.next in OCTDIGITS: - escape = escape + source.get() - else: - break - escape = escape[1:] - # FIXME: support unicode characters! - return LITERAL, chr(int(escape[-6:], 8) & 0xff) - if len(escape) == 2: - return LITERAL, escape[1] + if escape[1:2] == "x": + while source.next in HEXDIGITS: + escape = escape + source.get() + escape = escape[2:] + return LITERAL, int(escape[-4:], 16) & CHARMASK + elif escape[1:2] in DIGITS: + while 1: + group = _group(escape, state.groups) + if group: + if (not source.next or + not _group(escape + source.next, state.groups)): + return GROUP, group + escape = escape + source.get() + elif source.next in OCTDIGITS: + escape = escape + source.get() + else: + break + escape = escape[1:] + return LITERAL, int(escape[-6:], 8) & CHARMASK + if len(escape) == 2: + return LITERAL, ord(escape[1]) except ValueError: - pass + pass raise error, "bogus escape: %s" % repr(escape) - def _branch(pattern, items): - # form a branch operator from a set of items subpattern = SubPattern(pattern) # check if all items share a common prefix while 1: - prefix = None - for item in items: - if not item: - break - if prefix is None: - prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". - # move it out of the branch - for item in items: - del item[0] - subpattern.append(prefix) - continue # check next one - break + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break # check if the branch can be replaced by a character set for item in items: - if len(item) != 1 or item[0][0] != LITERAL: - break + if len(item) != 1 or item[0][0] != LITERAL: + break else: - # we can store this as a character set instead of a - # branch (FIXME: use a range if possible) - set = [] - for item in items: - set.append(item[0]) - subpattern.append((IN, set)) - return subpattern + # we can store this as a character set instead of a + # branch (FIXME: use a range if possible) + set = [] + for item in items: + set.append(item[0]) + subpattern.append((IN, set)) + return subpattern subpattern.append((BRANCH, (None, items))) return subpattern @@ -308,197 +303,227 @@ def _parse(source, state, flags=0): while 1: - if source.next in ("|", ")"): - break # end of subpattern - this = source.get() - if this is None: - break # end of pattern - - if state.flags & SRE_FLAG_VERBOSE: - # skip whitespace and comments - if this in WHITESPACE: - continue - if this == "#": - while 1: - this = source.get() - if this in (None, "\n"): - break - continue - - if this and this[0] not in SPECIAL_CHARS: - subpattern.append((LITERAL, this)) - - elif this == "[": - # character set - set = [] -## if source.match(":"): -## pass # handle character classes - if source.match("^"): - set.append((NEGATE, None)) - # check remaining characters - start = set[:] - while 1: - this = source.get() - if this == "]" and set != start: - break - elif this and this[0] == "\\": - code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, this - else: - raise error, "unexpected end of regular expression" - if source.match("-"): - # potential range - this = source.get() - if this == "]": - set.append(code1) - set.append((LITERAL, "-")) - break - else: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, this - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error, "illegal range" - if len(code1[1]) != 1 or len(code2[1]) != 1: - raise error, "illegal range" - set.append((RANGE, (code1[1], code2[1]))) - else: - if code1[0] is IN: - code1 = code1[1][0] - set.append(code1) - - # FIXME: <fl> move set optimization to compiler! - if len(set)==1 and set[0][0] is LITERAL: - subpattern.append(set[0]) # optimization - elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpattern.append((NOT_LITERAL, set[1][1])) # optimization - else: - # FIXME: <fl> add charmap optimization - subpattern.append((IN, set)) - - elif this and this[0] in REPEAT_CHARS: - # repeat previous item - if this == "?": - min, max = 0, 1 - elif this == "*": - min, max = 0, MAXREPEAT - elif this == "+": - min, max = 1, MAXREPEAT - elif this == "{": - min, max = 0, MAXREPEAT - lo = hi = "" - while source.next in DIGITS: - lo = lo + source.get() - if source.match(","): - while source.next in DIGITS: - hi = hi + source.get() - else: - hi = lo - if not source.match("}"): - raise error, "bogus range" - if lo: - min = int(lo) - if hi: - max = int(hi) - # FIXME: <fl> check that hi >= lo! - else: - raise error, "not supported" - # figure out which item to repeat - if subpattern: - item = subpattern[-1:] - else: - raise error, "nothing to repeat" - if source.match("?"): - subpattern[-1] = (MIN_REPEAT, (min, max, item)) - else: - subpattern[-1] = (MAX_REPEAT, (min, max, item)) - - elif this == ".": - subpattern.append((ANY, None)) - - elif this == "(": - group = 1 - name = None - if source.match("?"): - group = 0 - # options - if source.match("P"): - # python extensions - if source.match("<"): - # named group: skip forward to end of name - name = "" - while 1: - char = source.get() - if char is None: - raise error, "unterminated name" - if char == ">": - break - name = name + char - group = 1 - if not isname(name): - raise error, "illegal character in group name" - elif source.match("="): - # named backreference - raise error, "not yet implemented" - else: - char = source.get() - if char is None: - raise error, "unexpected end of pattern" - raise error, "unknown specifier: ?P%s" % char - elif source.match(":"): - # non-capturing group - group = 2 - elif source.match("#"): - # comment - while 1: - if source.next is None or source.next == ")": - break - source.get() - else: - # flags - while FLAGS.has_key(source.next): - state.flags = state.flags | FLAGS[source.get()] - if group: - # parse group contents - b = [] - if group == 2: - # anonymous group - group = None - else: - group = state.getgroup(name) - while 1: - p = _parse(source, state, flags) - if source.match(")"): - if b: - b.append(p) - p = _branch(state, b) - subpattern.append((SUBPATTERN, (group, p))) - break - elif source.match("|"): - b.append(p) - else: - raise error, "group not properly closed" - else: - while 1: - char = source.get() - if char is None or char == ")": - break - raise error, "unknown extension" - - elif this == "^": - subpattern.append((AT, AT_BEGINNING)) - - elif this == "$": - subpattern.append((AT, AT_END)) - - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpattern.append(code) - - else: - raise error, "parser error" + if source.next in ("|", ")"): + break # end of subpattern + this = source.get() + if this is None: + break # end of pattern + + if state.flags & SRE_FLAG_VERBOSE: + # skip whitespace and comments + if this in WHITESPACE: + continue + if this == "#": + while 1: + this = source.get() + if this in (None, "\n"): + break + continue + + if this and this[0] not in SPECIAL_CHARS: + subpattern.append((LITERAL, ord(this))) + + elif this == "[": + # character set + set = [] +## if source.match(":"): +## pass # handle character classes + if source.match("^"): + set.append((NEGATE, None)) + # check remaining characters + start = set[:] + while 1: + this = source.get() + if this == "]" and set != start: + break + elif this and this[0] == "\\": + code1 = _class_escape(source, this) + elif this: + code1 = LITERAL, ord(this) + else: + raise error, "unexpected end of regular expression" + if source.match("-"): + # potential range + this = source.get() + if this == "]": + set.append(code1) + set.append((LITERAL, ord("-"))) + break + else: + if this[0] == "\\": + code2 = _class_escape(source, this) + else: + code2 = LITERAL, ord(this) + if code1[0] != LITERAL or code2[0] != LITERAL: + raise error, "illegal range" + set.append((RANGE, (code1[1], code2[1]))) + else: + if code1[0] is IN: + code1 = code1[1][0] + set.append(code1) + + # FIXME: <fl> move set optimization to compiler! + if len(set)==1 and set[0][0] is LITERAL: + subpattern.append(set[0]) # optimization + elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: + subpattern.append((NOT_LITERAL, set[1][1])) # optimization + else: + # FIXME: <fl> add charmap optimization + subpattern.append((IN, set)) + + elif this and this[0] in REPEAT_CHARS: + # repeat previous item + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, MAXREPEAT + elif this == "+": + min, max = 1, MAXREPEAT + elif this == "{": + min, max = 0, MAXREPEAT + lo = hi = "" + while source.next in DIGITS: + lo = lo + source.get() + if source.match(","): + while source.next in DIGITS: + hi = hi + source.get() + else: + hi = lo + if not source.match("}"): + raise error, "bogus range" + if lo: + min = int(lo) + if hi: + max = int(hi) + # FIXME: <fl> check that hi >= lo! + else: + raise error, "not supported" + # figure out which item to repeat + if subpattern: + item = subpattern[-1:] + else: + raise error, "nothing to repeat" + if source.match("?"): + subpattern[-1] = (MIN_REPEAT, (min, max, item)) + else: + subpattern[-1] = (MAX_REPEAT, (min, max, item)) + + elif this == ".": + subpattern.append((ANY, None)) + + elif this == "(": + group = 1 + name = None + if source.match("?"): + group = 0 + # options + if source.match("P"): + # python extensions + if source.match("<"): + # named group: skip forward to end of name + name = "" + while 1: + char = source.get() + if char is None: + raise error, "unterminated name" + if char == ">": + break + name = name + char + group = 1 + if not isname(name): + raise error, "illegal character in group name" + elif source.match("="): + # named backreference + name = "" + while 1: + char = source.get() + if char is None: + raise error, "unterminated name" + if char == ")": + break + name = name + char + if not isname(name): + raise error, "illegal character in group name" + gid = state.groupdict.get(name) + if gid is None: + raise error, "unknown group name" + subpattern.append((GROUP, gid)) + else: + char = source.get() + if char is None: + raise error, "unexpected end of pattern" + raise error, "unknown specifier: ?P%s" % char + elif source.match(":"): + # non-capturing group + group = 2 + elif source.match("#"): + # comment + while 1: + if source.next is None or source.next == ")": + break + source.get() + elif source.next in ("=", "!"): + # lookahead assertions + char = source.get() + b = [] + while 1: + p = _parse(source, state, flags) + if source.next == ")": + if b: + b.append(p) + p = _branch(state, b) + if char == "=": + subpattern.append((ASSERT, p)) + else: + subpattern.append((ASSERT_NOT, p)) + break + elif source.match("|"): + b.append(p) + else: + raise error, "pattern not properly closed" + else: + # flags + while FLAGS.has_key(source.next): + state.flags = state.flags | FLAGS[source.get()] + if group: + # parse group contents + b = [] + if group == 2: + # anonymous group + group = None + else: + group = state.getgroup(name) + while 1: + p = _parse(source, state, flags) + if source.match(")"): + if b: + b.append(p) + p = _branch(state, b) + subpattern.append((SUBPATTERN, (group, p))) + break + elif source.match("|"): + b.append(p) + else: + raise error, "group not properly closed" + else: + while 1: + char = source.get() + if char is None or char == ")": + break + raise error, "unknown extension" + + elif this == "^": + subpattern.append((AT, AT_BEGINNING)) + + elif this == "$": + subpattern.append((AT, AT_END)) + + elif this and this[0] == "\\": + code = _escape(source, this, state) + subpattern.append(code) + + else: + raise error, "parser error" return subpattern @@ -508,19 +533,19 @@ def parse(pattern, flags=0): state = State() b = [] while 1: - p = _parse(source, state, flags) - tail = source.get() - if tail == "|": - b.append(p) - elif tail == ")": - raise error, "unbalanced parenthesis" - elif tail is None: - if b: - b.append(p) - p = _branch(state, b) - break - else: - raise error, "bogus characters at end of regular expression" + p = _parse(source, state, flags) + tail = source.get() + if tail == "|": + b.append(p) + elif tail == ")": + raise error, "unbalanced parenthesis" + elif tail is None: + if b: + b.append(p) + p = _branch(state, b) + break + else: + raise error, "bogus characters at end of regular expression" return p def parse_template(source, pattern): @@ -530,44 +555,58 @@ def parse_template(source, pattern): p = [] a = p.append while 1: - this = s.get() - if this is None: - break # end of replacement string - if this and this[0] == "\\": - if this == "\\g": - name = "" - if s.match("<"): - while 1: - char = s.get() - if char is None: - raise error, "unterminated group name" - if char == ">": - break - name = name + char - if not name: - raise error, "bad group name" - try: - index = int(name) - except ValueError: - if not isname(name): - raise error, "illegal character in group name" - try: - index = pattern.groupindex[name] - except KeyError: - raise IndexError, "unknown group name" - a((MARK, index)) - elif len(this) > 1 and this[1] in DIGITS: - while s.next in DIGITS: - this = this + s.get() - a((MARK, int(this[1:]))) - else: - try: - a(ESCAPES[this]) - except KeyError: - for char in this: - a((LITERAL, char)) - else: - a((LITERAL, this)) + this = s.get() + if this is None: + break # end of replacement string + if this and this[0] == "\\": + # group + if this == "\\g": + name = "" + if s.match("<"): + while 1: + char = s.get() + if char is None: + raise error, "unterminated group name" + if char == ">": + break + name = name + char + if not name: + raise error, "bad group name" + try: + index = int(name) + except ValueError: + if not isname(name): + raise error, "illegal character in group name" + try: + index = pattern.groupindex[name] + except KeyError: + raise IndexError, "unknown group name" + a((MARK, index)) + elif len(this) > 1 and this[1] in DIGITS: + code = None + while 1: + group = _group(this, pattern.groups+1) + if group: + if (not s.next or + not _group(this + s.next, pattern.groups+1)): + code = MARK, int(group) + break + elif s.next in OCTDIGITS: + this = this + s.get() + else: + break + if not code: + this = this[1:] + code = LITERAL, int(this[-6:], 8) & CHARMASK + a(code) + else: + try: + a(ESCAPES[this]) + except KeyError: + for c in this: + a((LITERAL, ord(c))) + else: + a((LITERAL, ord(this))) return p def expand_template(template, match): @@ -575,12 +614,17 @@ def expand_template(template, match): # code instead p = [] a = p.append + sep = match.string[:0] + if type(sep) is type(""): + char = chr + else: + char = unichr for c, s in template: - if c is LITERAL: - a(s) - elif c is MARK: - s = match.group(s) - if s is None: - raise error, "empty group" - a(s) - return match.string[:0].join(p) + if c is LITERAL: + a(char(s)) + elif c is MARK: + s = match.group(s) + if s is None: + raise error, "empty group" + a(s) + return sep.join(p) diff --git a/Lib/dos-8x3/test_has.py b/Lib/dos-8x3/test_has.py new file mode 100644 index 0000000..51b4c33 --- /dev/null +++ b/Lib/dos-8x3/test_has.py @@ -0,0 +1,26 @@ +# test the invariant that +# iff a==b then hash(a)==hash(b) +# + +import test_support + + +def same_hash(*objlist): + # hash each object given an raise TestFailed if + # the hash values are not all the same + hashed = map(hash, objlist) + for h in hashed[1:]: + if h != hashed[0]: + raise TestFailed, "hashed values differ: %s" % `objlist` + + + +same_hash(1, 1L, 1.0, 1.0+0.0j) +same_hash(int(1), long(1), float(1), complex(1)) + +same_hash(long(1.23e300), float(1.23e300)) + +same_hash(float(0.5), complex(0.5, 0.0)) + + + |