diff options
-rw-r--r-- | Lib/sre_compile.py | 81 | ||||
-rw-r--r-- | Lib/sre_constants.py | 5 | ||||
-rw-r--r-- | Modules/_sre.c | 13 | ||||
-rw-r--r-- | Modules/sre_constants.h | 39 |
4 files changed, 107 insertions, 31 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 44cb23e..539e878 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -156,6 +156,8 @@ def _compile_charset(charset, flags, code, fixup=None): emit(fixup(av[1])) elif op is CHARSET: code.extend(av) + elif op is BIGCHARSET: + code.extend(av) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CHCODES[CH_LOCALE[av]]) @@ -185,7 +187,7 @@ def _optimize_charset(charset, fixup): return charset # cannot compress except IndexError: # character set contains unicode characters - return charset + return _optimize_unicode(charset, fixup) # compress character map i = p = n = 0 runs = [] @@ -211,19 +213,78 @@ def _optimize_charset(charset, fixup): return out else: # use bitmap - data = [] - m = 1; v = 0 - for c in charmap: - if c: - v = v + m - m = m << 1 - if m > MAXCODE: - data.append(v) - m = 1; v = 0 + data = _mk_bitmap(charmap) out.append((CHARSET, data)) return out return charset +def _mk_bitmap(bits): + data = [] + m = 1; v = 0 + for c in bits: + if c: + v = v + m + m = m << 1 + if m > MAXCODE: + data.append(v) + m = 1; v = 0 + return data + +# To represent a big charset, first a bitmap of all characters in the +# set is constructed. Then, this bitmap is sliced into chunks of 256 +# characters, duplicate chunks are eliminated, and each chunk is +# given a number. In the compiled expression, the charset is +# represented by a 16-bit word sequence, consisting of one word for +# the number of different chunks, a sequence of 256 bytes (128 words) +# of chunk numbers indexed by their original chunk position, and a +# sequence of chunks (16 words each). + +# Compression is normally good: in a typical charset, large ranges of +# Unicode will be either completely excluded (e.g. if only cyrillic +# letters are to be matched), or completely included (e.g. if large +# subranges of Kanji match). These ranges will be represented by +# chunks of all one-bits or all zero-bits. 
+ +# Matching can be also done efficiently: the more significant byte of +# the Unicode character is an index into the chunk number, and the +# less significant byte is a bit index in the chunk (just like the +# CHARSET matching). + +def _optimize_unicode(charset, fixup): + charmap = [0]*65536 + negate = 0 + for op, av in charset: + if op is NEGATE: + negate = 1 + elif op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in range(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is CATEGORY: + # XXX: could expand category + return charset # cannot compress + if negate: + for i in range(65536): + charmap[i] = not charmap[i] + comps = {} + mapping = [0]*256 + block = 0 + data = [] + for i in range(256): + chunk = tuple(charmap[i*256:(i+1)*256]) + new = comps.setdefault(chunk, block) + mapping[i] = new + if new == block: + block += 1 + data += _mk_bitmap(chunk) + header = [block] + assert MAXCODE == 65535 + for i in range(128): + header.append(mapping[2*i]+256*mapping[2*i+1]) + data[0:0] = header + return [(BIGCHARSET, data)] + def _simple(av): # check if av is a "simple" operator lo, hi = av[2].getwidth() diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index bbe7880..3296b94 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -11,7 +11,7 @@ # update when constants are added or removed -MAGIC = 20010320 +MAGIC = 20010701 # max code word in this release @@ -33,6 +33,7 @@ ANY_ALL = "any_all" ASSERT = "assert" ASSERT_NOT = "assert_not" AT = "at" +BIGCHARSET = "bigcharset" BRANCH = "branch" CALL = "call" CATEGORY = "category" @@ -103,7 +104,7 @@ OPCODES = [ BRANCH, CALL, CATEGORY, - CHARSET, + CHARSET, BIGCHARSET, GROUPREF, GROUPREF_IGNORE, IN, IN_IGNORE, INFO, diff --git a/Modules/_sre.c b/Modules/_sre.c index 3e2d907..51a747a 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -506,6 +506,19 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) set += 16; break; + case SRE_OP_BIGCHARSET: + /* <BIGCHARSET> <blockcount> <256 blockindices> 
<blocks> */ + { + int count, block; + count = *(set++); + block = ((unsigned char*)set)[ch >> 8]; + set += 128; + if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) + return ok; + set += count*16; + break; + } + case SRE_OP_CATEGORY: /* <CATEGORY> <code> */ if (sre_category(set[0], (int) ch)) diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 73bcb34..ebb9fd0 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20010320 +#define SRE_MAGIC 20010701 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -23,24 +23,25 @@ #define SRE_OP_CALL 8 #define SRE_OP_CATEGORY 9 #define SRE_OP_CHARSET 10 -#define SRE_OP_GROUPREF 11 -#define SRE_OP_GROUPREF_IGNORE 12 -#define SRE_OP_IN 13 -#define SRE_OP_IN_IGNORE 14 -#define SRE_OP_INFO 15 -#define SRE_OP_JUMP 16 -#define SRE_OP_LITERAL 17 -#define SRE_OP_LITERAL_IGNORE 18 -#define SRE_OP_MARK 19 -#define SRE_OP_MAX_UNTIL 20 -#define SRE_OP_MIN_UNTIL 21 -#define SRE_OP_NOT_LITERAL 22 -#define SRE_OP_NOT_LITERAL_IGNORE 23 -#define SRE_OP_NEGATE 24 -#define SRE_OP_RANGE 25 -#define SRE_OP_REPEAT 26 -#define SRE_OP_REPEAT_ONE 27 -#define SRE_OP_SUBPATTERN 28 +#define SRE_OP_BIGCHARSET 11 +#define SRE_OP_GROUPREF 12 +#define SRE_OP_GROUPREF_IGNORE 13 +#define SRE_OP_IN 14 +#define SRE_OP_IN_IGNORE 15 +#define SRE_OP_INFO 16 +#define SRE_OP_JUMP 17 +#define SRE_OP_LITERAL 18 +#define SRE_OP_LITERAL_IGNORE 19 +#define SRE_OP_MARK 20 +#define SRE_OP_MAX_UNTIL 21 +#define SRE_OP_MIN_UNTIL 22 +#define SRE_OP_NOT_LITERAL 23 +#define SRE_OP_NOT_LITERAL_IGNORE 24 +#define SRE_OP_NEGATE 25 +#define SRE_OP_RANGE 26 +#define SRE_OP_REPEAT 27 +#define SRE_OP_REPEAT_ONE 28 +#define SRE_OP_SUBPATTERN 29 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 |