diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-02 12:00:07 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2000-07-02 12:00:07 (GMT) |
commit | 3562f1176403653ebfbef6275d449ad42d1b843a (patch) | |
tree | 371b851c1c6bd77c5149a4c553c5ca94824632b4 /Lib | |
parent | c13222cdff4373a9763b9c7df4b2e12e7e3b776f (diff) | |
download | cpython-3562f1176403653ebfbef6275d449ad42d1b843a.zip cpython-3562f1176403653ebfbef6275d449ad42d1b843a.tar.gz cpython-3562f1176403653ebfbef6275d449ad42d1b843a.tar.bz2 |
-- use charset bitmaps where appropriate. this gives a 5-10%
speedup for some tests, including the python tokenizer.
-- added support for an optional charset anchor to the engine
(currently unused by the code generator).
-- removed workaround for array module bug.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sre_compile.py | 110 | ||||
-rw-r--r-- | Lib/sre_constants.py | 27 | ||||
-rw-r--r-- | Lib/sre_parse.py | 7 |
3 files changed, 116 insertions, 28 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 14b1970..a593ee7 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -16,12 +16,71 @@ import _sre from sre_constants import * # find an array type code that matches the engine's code size -for WORDSIZE in "BHil": +for WORDSIZE in "Hil": if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): break else: raise RuntimeError, "cannot find a useable array type" +MAXCODE = 65535 + +def _charset(charset, fixup): + # internal: optimize character set + out = [] + charmap = [0]*256 + try: + for op, av in charset: + if op is NEGATE: + out.append((op, av)) + elif op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in range(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is CATEGORY: + # FIXME: could append to charmap tail + return charset # cannot compress + except IndexError: + # unicode + return charset + # compress character map + i = p = n = 0 + runs = [] + for c in charmap: + if c: + if n == 0: + p = i + n = n + 1 + elif n: + runs.append((p, n)) + n = 0 + i = i + 1 + if n: + runs.append((p, n)) + if len(runs) <= 2: + # use literal/range + for p, n in runs: + if n == 1: + out.append((LITERAL, p)) + else: + out.append((RANGE, (p, p+n-1))) + if len(out) < len(charset): + return out + else: + # use bitmap + data = [] + m = 1; v = 0 + for c in charmap: + if c: + v = v + m + m = m << 1 + if m > MAXCODE: + data.append(v) + m = 1; v = 0 + out.append((CHARSET, data)) + return out + return charset + def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append @@ -41,7 +100,7 @@ def _compile(code, pattern, flags): emit(OPCODES[op]) fixup = lambda x: x skip = len(code); emit(0) - for op, av in av: + for op, av in _charset(av, fixup): emit(OPCODES[op]) if op is NEGATE: pass @@ -50,6 +109,8 @@ def _compile(code, pattern, flags): elif op is RANGE: emit(fixup(av[0])) emit(fixup(av[1])) + elif op is CHARSET: + code.extend(av) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CHCODES[CH_LOCALE[av]]) @@ -155,13 +216,14 @@ def _compile(code, pattern, flags): def _compile_info(code, pattern, flags): # internal: compile an info block. in the current version, - # this contains min/max pattern width and a literal prefix, - # if any + # this contains min/max pattern width, and an optional literal + # prefix or a character map lo, hi = pattern.getwidth() if lo == 0: return # not worth it # look for a literal prefix prefix = [] + charset = [] # not used if not (flags & SRE_FLAG_IGNORECASE): for op, av in pattern.data: if op is LITERAL: @@ -174,26 +236,40 @@ def _compile_info(code, pattern, flags): skip = len(code); emit(0) # literal flag mask = 0 - if len(prefix) == len(pattern.data): - mask = 1 + if prefix: + mask = SRE_INFO_PREFIX + if len(prefix) == len(pattern.data): + mask = mask + SRE_INFO_LITERAL + elif charset: + mask = mask + SRE_INFO_CHARSET emit(mask) # pattern length - emit(lo) - if hi < 32768: + if lo < MAXCODE: + emit(lo) + else: + emit(MAXCODE) + prefix = prefix[:MAXCODE] + if hi < MAXCODE: emit(hi) else: emit(0) # add literal prefix - emit(len(prefix)) if prefix: - code.extend(prefix) - # generate overlap table - table = [-1] + ([0]*len(prefix)) - for i in range(len(prefix)): - table[i+1] = table[i]+1 - while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: - table[i+1] = table[table[i+1]-1]+1 - code.extend(table[1:]) # don't store first entry + emit(len(prefix)) + if prefix: + code.extend(prefix) + # generate overlap table + table = [-1] + ([0]*len(prefix)) + for i in range(len(prefix)): + table[i+1] = table[i]+1 + while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: + table[i+1] = table[table[i+1]-1]+1 + code.extend(table[1:]) # don't store first entry + elif charset: + for char in charset: + emit(OPCODES[LITERAL]) + emit(char) + emit(OPCODES[FAILURE]) code[skip] = len(code) - skip def compile(p, flags=0): diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 39db58f..f0e45ea 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -28,6 +28,7 @@ AT = "at" BRANCH = "branch" CALL = "call" CATEGORY = "category" +CHARSET = "charset" GROUP = "group" GROUP_IGNORE = "group_ignore" IN = "in" @@ -87,6 +88,7 @@ OPCODES = [ BRANCH, CALL, CATEGORY, + CHARSET, GROUP, GROUP_IGNORE, IN, IN_IGNORE, INFO, @@ -166,13 +168,18 @@ CH_UNICODE = { } # flags -SRE_FLAG_TEMPLATE = 1 -SRE_FLAG_IGNORECASE = 2 -SRE_FLAG_LOCALE = 4 -SRE_FLAG_MULTILINE = 8 -SRE_FLAG_DOTALL = 16 -SRE_FLAG_UNICODE = 32 -SRE_FLAG_VERBOSE = 64 +SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) +SRE_FLAG_IGNORECASE = 2 # case insensitive +SRE_FLAG_LOCALE = 4 # honour system locale +SRE_FLAG_MULTILINE = 8 # treat target as multiline string +SRE_FLAG_DOTALL = 16 # treat target as a single string +SRE_FLAG_UNICODE = 32 # use unicode locale +SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments + +# flags for INFO primitive +SRE_INFO_PREFIX = 1 # has prefix +SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) +SRE_INFO_CHARSET = 4 # pattern starts with character from given set if __name__ == "__main__": import string @@ -201,6 +208,7 @@ if __name__ == "__main__": dump(f, OPCODES, "SRE_OP") dump(f, ATCODES, "SRE") dump(f, CHCODES, "SRE") + f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) @@ -208,5 +216,10 @@ if __name__ == "__main__": f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) + + f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) + f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) + f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) + f.close() print "done" diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 12f49c3..b263256 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -16,11 +16,10 @@ import _sre from sre_constants import * -# FIXME: should be 65535, but the arraymodule is still broken -MAXREPEAT = 32767 +MAXREPEAT = 65535 -# FIXME: might change in 2.0 final. but for now, this seems -# to be the best way to be compatible with 1.5.2 +# FIXME: the following might change in 2.0 final. but for now, this +# seems to be the best way to be compatible with 1.5.2 CHARMASK = 0xff SPECIAL_CHARS = ".\\[{()*+?^$|" |