summaryrefslogtreecommitdiffstats
path: root/Lib/sre_compile.py
blob: 1d5dae37f5f018216e60150ce0027ac2bd536bf3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys

from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    for op, av in pattern:
        if op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            skip = len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            elif _simple(av) and op != REPEAT:
                if op == MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            for av in av[1]:
                skip = len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit((av[0]-1)*2)
            skipyes = len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = len(code); emit(0)
                code[skipyes] = len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = len(code) - skipno
            else:
                code[skipyes] = len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = lambda x: x
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    charmap = [False]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                out.append((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = True
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = True
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runs.append((p, n))
            n = 0
        i = i + 1
    if n:
        runs.append((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        return out
    return charset

def _mk_bitmap(bits):
    data = []
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m << 1
        if m > MAXCODE:
            data.append(v)
            m, v = start
    return data

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminitated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.

def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [False]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = True
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = True
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in range(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if MAXCODE == 65535:
        code = 'H'
    else:
        code = 'L'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    header = header + array.array(code, mapping).tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    c = []
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            c.append((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        c.append((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
           "sorry, but this version only supports 100 named groups"

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )