sre_compile.py revision eb99e5157498ae37117a54cdb01fea082d842a0d
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre
14import sre_parse
15from sre_constants import *
16
17assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
19if _sre.CODESIZE == 2:
20    MAXCODE = 65535
21else:
22    MAXCODE = 0xFFFFFFFF
23
24_LITERAL_CODES = {LITERAL, NOT_LITERAL}
25_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
26_SUCCESS_CODES = {SUCCESS, FAILURE}
27_ASSERT_CODES = {ASSERT, ASSERT_NOT}
28
29# Sets of lowercase characters which have the same uppercase.
30_equivalences = (
31    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
32    (0x69, 0x131), # iı
33    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
34    (0x73, 0x17f), # sſ
35    # MICRO SIGN, GREEK SMALL LETTER MU
36    (0xb5, 0x3bc), # µμ
37    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
38    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
39    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
40    (0x390, 0x1fd3), # ΐΐ
41    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
42    (0x3b0, 0x1fe3), # ΰΰ
43    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
44    (0x3b2, 0x3d0), # βϐ
45    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
46    (0x3b5, 0x3f5), # εϵ
47    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
48    (0x3b8, 0x3d1), # θϑ
49    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
50    (0x3ba, 0x3f0), # κϰ
51    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
52    (0x3c0, 0x3d6), # πϖ
53    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
54    (0x3c1, 0x3f1), # ρϱ
55    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
56    (0x3c2, 0x3c3), # ςσ
57    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
58    (0x3c6, 0x3d5), # φϕ
59    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
60    (0x1e61, 0x1e9b), # ṡẛ
61    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
62    (0xfb05, 0xfb06), # ſtst
63)
64
65# Maps the lowercase code to lowercase codes which have the same uppercase.
66_ignorecase_fixes = {i: tuple(j for j in t if i != j)
67                     for t in _equivalences for i in t}
68
69def _compile(code, pattern, flags):
70    # internal: compile a (sub)pattern
71    emit = code.append
72    _len = len
73    LITERAL_CODES = _LITERAL_CODES
74    REPEATING_CODES = _REPEATING_CODES
75    SUCCESS_CODES = _SUCCESS_CODES
76    ASSERT_CODES = _ASSERT_CODES
77    if (flags & SRE_FLAG_IGNORECASE and
78            not (flags & SRE_FLAG_LOCALE) and
79            flags & SRE_FLAG_UNICODE):
80        fixes = _ignorecase_fixes
81    else:
82        fixes = None
83    for op, av in pattern:
84        if op in LITERAL_CODES:
85            if flags & SRE_FLAG_IGNORECASE:
86                lo = _sre.getlower(av, flags)
87                if fixes and lo in fixes:
88                    emit(IN_IGNORE)
89                    skip = _len(code); emit(0)
90                    if op is NOT_LITERAL:
91                        emit(NEGATE)
92                    for k in (lo,) + fixes[lo]:
93                        emit(LITERAL)
94                        emit(k)
95                    emit(FAILURE)
96                    code[skip] = _len(code) - skip
97                else:
98                    emit(OP_IGNORE[op])
99                    emit(lo)
100            else:
101                emit(op)
102                emit(av)
103        elif op is IN:
104            if flags & SRE_FLAG_IGNORECASE:
105                emit(OP_IGNORE[op])
106                def fixup(literal, flags=flags):
107                    return _sre.getlower(literal, flags)
108            else:
109                emit(op)
110                fixup = None
111            skip = _len(code); emit(0)
112            _compile_charset(av, flags, code, fixup, fixes)
113            code[skip] = _len(code) - skip
114        elif op is ANY:
115            if flags & SRE_FLAG_DOTALL:
116                emit(ANY_ALL)
117            else:
118                emit(ANY)
119        elif op in REPEATING_CODES:
120            if flags & SRE_FLAG_TEMPLATE:
121                raise error("internal: unsupported template operator")
122            elif _simple(av) and op is not REPEAT:
123                if op is MAX_REPEAT:
124                    emit(REPEAT_ONE)
125                else:
126                    emit(MIN_REPEAT_ONE)
127                skip = _len(code); emit(0)
128                emit(av[0])
129                emit(av[1])
130                _compile(code, av[2], flags)
131                emit(SUCCESS)
132                code[skip] = _len(code) - skip
133            else:
134                emit(REPEAT)
135                skip = _len(code); emit(0)
136                emit(av[0])
137                emit(av[1])
138                _compile(code, av[2], flags)
139                code[skip] = _len(code) - skip
140                if op is MAX_REPEAT:
141                    emit(MAX_UNTIL)
142                else:
143                    emit(MIN_UNTIL)
144        elif op is SUBPATTERN:
145            if av[0]:
146                emit(MARK)
147                emit((av[0]-1)*2)
148            # _compile_info(code, av[1], flags)
149            _compile(code, av[1], flags)
150            if av[0]:
151                emit(MARK)
152                emit((av[0]-1)*2+1)
153        elif op in SUCCESS_CODES:
154            emit(op)
155        elif op in ASSERT_CODES:
156            emit(op)
157            skip = _len(code); emit(0)
158            if av[0] >= 0:
159                emit(0) # look ahead
160            else:
161                lo, hi = av[1].getwidth()
162                if lo != hi:
163                    raise error("look-behind requires fixed-width pattern")
164                emit(lo) # look behind
165            _compile(code, av[1], flags)
166            emit(SUCCESS)
167            code[skip] = _len(code) - skip
168        elif op is CALL:
169            emit(op)
170            skip = _len(code); emit(0)
171            _compile(code, av, flags)
172            emit(SUCCESS)
173            code[skip] = _len(code) - skip
174        elif op is AT:
175            emit(op)
176            if flags & SRE_FLAG_MULTILINE:
177                av = AT_MULTILINE.get(av, av)
178            if flags & SRE_FLAG_LOCALE:
179                av = AT_LOCALE.get(av, av)
180            elif flags & SRE_FLAG_UNICODE:
181                av = AT_UNICODE.get(av, av)
182            emit(av)
183        elif op is BRANCH:
184            emit(op)
185            tail = []
186            tailappend = tail.append
187            for av in av[1]:
188                skip = _len(code); emit(0)
189                # _compile_info(code, av, flags)
190                _compile(code, av, flags)
191                emit(JUMP)
192                tailappend(_len(code)); emit(0)
193                code[skip] = _len(code) - skip
194            emit(0) # end of branch
195            for tail in tail:
196                code[tail] = _len(code) - tail
197        elif op is CATEGORY:
198            emit(op)
199            if flags & SRE_FLAG_LOCALE:
200                av = CH_LOCALE[av]
201            elif flags & SRE_FLAG_UNICODE:
202                av = CH_UNICODE[av]
203            emit(av)
204        elif op is GROUPREF:
205            if flags & SRE_FLAG_IGNORECASE:
206                emit(OP_IGNORE[op])
207            else:
208                emit(op)
209            emit(av-1)
210        elif op is GROUPREF_EXISTS:
211            emit(op)
212            emit(av[0]-1)
213            skipyes = _len(code); emit(0)
214            _compile(code, av[1], flags)
215            if av[2]:
216                emit(JUMP)
217                skipno = _len(code); emit(0)
218                code[skipyes] = _len(code) - skipyes + 1
219                _compile(code, av[2], flags)
220                code[skipno] = _len(code) - skipno
221            else:
222                code[skipyes] = _len(code) - skipyes + 1
223        else:
224            raise ValueError("unsupported operand type", op)
225
226def _compile_charset(charset, flags, code, fixup=None, fixes=None):
227    # compile charset subprogram
228    emit = code.append
229    for op, av in _optimize_charset(charset, fixup, fixes):
230        emit(op)
231        if op is NEGATE:
232            pass
233        elif op is LITERAL:
234            emit(av)
235        elif op is RANGE or op is RANGE_IGNORE:
236            emit(av[0])
237            emit(av[1])
238        elif op is CHARSET:
239            code.extend(av)
240        elif op is BIGCHARSET:
241            code.extend(av)
242        elif op is CATEGORY:
243            if flags & SRE_FLAG_LOCALE:
244                emit(CH_LOCALE[av])
245            elif flags & SRE_FLAG_UNICODE:
246                emit(CH_UNICODE[av])
247            else:
248                emit(av)
249        else:
250            raise error("internal: unsupported set operator")
251    emit(FAILURE)
252
253def _optimize_charset(charset, fixup, fixes):
254    # internal: optimize character set
255    out = []
256    tail = []
257    charmap = bytearray(256)
258    for op, av in charset:
259        while True:
260            try:
261                if op is LITERAL:
262                    if fixup:
263                        lo = fixup(av)
264                        charmap[lo] = 1
265                        if fixes and lo in fixes:
266                            for k in fixes[lo]:
267                                charmap[k] = 1
268                    else:
269                        charmap[av] = 1
270                elif op is RANGE:
271                    r = range(av[0], av[1]+1)
272                    if fixup:
273                        r = map(fixup, r)
274                    if fixup and fixes:
275                        for i in r:
276                            charmap[i] = 1
277                            if i in fixes:
278                                for k in fixes[i]:
279                                    charmap[k] = 1
280                    else:
281                        for i in r:
282                            charmap[i] = 1
283                elif op is NEGATE:
284                    out.append((op, av))
285                else:
286                    tail.append((op, av))
287            except IndexError:
288                if len(charmap) == 256:
289                    # character set contains non-UCS1 character codes
290                    charmap += b'\0' * 0xff00
291                    continue
292                # Character set contains non-BMP character codes.
293                # There are only two ranges of cased non-BMP characters:
294                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
295                # and for both ranges RANGE_IGNORE works.
296                if fixup and op is RANGE:
297                    op = RANGE_IGNORE
298                tail.append((op, av))
299            break
300
301    # compress character map
302    runs = []
303    q = 0
304    while True:
305        p = charmap.find(1, q)
306        if p < 0:
307            break
308        if len(runs) >= 2:
309            runs = None
310            break
311        q = charmap.find(0, p)
312        if q < 0:
313            runs.append((p, len(charmap)))
314            break
315        runs.append((p, q))
316    if runs is not None:
317        # use literal/range
318        for p, q in runs:
319            if q - p == 1:
320                out.append((LITERAL, p))
321            else:
322                out.append((RANGE, (p, q - 1)))
323        out += tail
324        # if the case was changed or new representation is more compact
325        if fixup or len(out) < len(charset):
326            return out
327        # else original character set is good enough
328        return charset
329
330    # use bitmap
331    if len(charmap) == 256:
332        data = _mk_bitmap(charmap)
333        out.append((CHARSET, data))
334        out += tail
335        return out
336
337    # To represent a big charset, first a bitmap of all characters in the
338    # set is constructed. Then, this bitmap is sliced into chunks of 256
339    # characters, duplicate chunks are eliminated, and each chunk is
340    # given a number. In the compiled expression, the charset is
341    # represented by a 32-bit word sequence, consisting of one word for
342    # the number of different chunks, a sequence of 256 bytes (64 words)
343    # of chunk numbers indexed by their original chunk position, and a
344    # sequence of 256-bit chunks (8 words each).
345
346    # Compression is normally good: in a typical charset, large ranges of
347    # Unicode will be either completely excluded (e.g. if only cyrillic
348    # letters are to be matched), or completely included (e.g. if large
349    # subranges of Kanji match). These ranges will be represented by
350    # chunks of all one-bits or all zero-bits.
351
352    # Matching can be also done efficiently: the more significant byte of
353    # the Unicode character is an index into the chunk number, and the
354    # less significant byte is a bit index in the chunk (just like the
355    # CHARSET matching).
356
357    charmap = bytes(charmap) # should be hashable
358    comps = {}
359    mapping = bytearray(256)
360    block = 0
361    data = bytearray()
362    for i in range(0, 65536, 256):
363        chunk = charmap[i: i + 256]
364        if chunk in comps:
365            mapping[i // 256] = comps[chunk]
366        else:
367            mapping[i // 256] = comps[chunk] = block
368            block += 1
369            data += chunk
370    data = _mk_bitmap(data)
371    data[0:0] = [block] + _bytes_to_codes(mapping)
372    out.append((BIGCHARSET, data))
373    out += tail
374    return out
375
376_CODEBITS = _sre.CODESIZE * 8
377_BITS_TRANS = b'0' + b'1' * 255
378def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
379    s = bits.translate(_BITS_TRANS)[::-1]
380    return [_int(s[i - _CODEBITS: i], 2)
381            for i in range(len(s), 0, -_CODEBITS)]
382
383def _bytes_to_codes(b):
384    # Convert block indices to word array
385    a = memoryview(b).cast('I')
386    assert a.itemsize == _sre.CODESIZE
387    assert len(a) * a.itemsize == len(b)
388    return a.tolist()
389
390def _simple(av):
391    # check if av is a "simple" operator
392    lo, hi = av[2].getwidth()
393    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
394
395def _generate_overlap_table(prefix):
396    """
397    Generate an overlap table for the following prefix.
398    An overlap table is a table of the same size as the prefix which
399    informs about the potential self-overlap for each index in the prefix:
400    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
401    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
402      prefix[0:k]
403    """
404    table = [0] * len(prefix)
405    for i in range(1, len(prefix)):
406        idx = table[i - 1]
407        while prefix[i] != prefix[idx]:
408            if idx == 0:
409                table[i] = 0
410                break
411            idx = table[idx - 1]
412        else:
413            table[i] = idx + 1
414    return table
415
416def _compile_info(code, pattern, flags):
417    # internal: compile an info block.  in the current version,
418    # this contains min/max pattern width, and an optional literal
419    # prefix or a character map
420    lo, hi = pattern.getwidth()
421    if lo == 0:
422        return # not worth it
423    # look for a literal prefix
424    prefix = []
425    prefixappend = prefix.append
426    prefix_skip = 0
427    charset = [] # not used
428    charsetappend = charset.append
429    if not (flags & SRE_FLAG_IGNORECASE):
430        # look for literal prefix
431        for op, av in pattern.data:
432            if op is LITERAL:
433                if len(prefix) == prefix_skip:
434                    prefix_skip = prefix_skip + 1
435                prefixappend(av)
436            elif op is SUBPATTERN and len(av[1]) == 1:
437                op, av = av[1][0]
438                if op is LITERAL:
439                    prefixappend(av)
440                else:
441                    break
442            else:
443                break
444        # if no prefix, look for charset prefix
445        if not prefix and pattern.data:
446            op, av = pattern.data[0]
447            if op is SUBPATTERN and av[1]:
448                op, av = av[1][0]
449                if op is LITERAL:
450                    charsetappend((op, av))
451                elif op is BRANCH:
452                    c = []
453                    cappend = c.append
454                    for p in av[1]:
455                        if not p:
456                            break
457                        op, av = p[0]
458                        if op is LITERAL:
459                            cappend((op, av))
460                        else:
461                            break
462                    else:
463                        charset = c
464            elif op is BRANCH:
465                c = []
466                cappend = c.append
467                for p in av[1]:
468                    if not p:
469                        break
470                    op, av = p[0]
471                    if op is LITERAL:
472                        cappend((op, av))
473                    else:
474                        break
475                else:
476                    charset = c
477            elif op is IN:
478                charset = av
479##     if prefix:
480##         print "*** PREFIX", prefix, prefix_skip
481##     if charset:
482##         print "*** CHARSET", charset
483    # add an info block
484    emit = code.append
485    emit(INFO)
486    skip = len(code); emit(0)
487    # literal flag
488    mask = 0
489    if prefix:
490        mask = SRE_INFO_PREFIX
491        if len(prefix) == prefix_skip == len(pattern.data):
492            mask = mask + SRE_INFO_LITERAL
493    elif charset:
494        mask = mask + SRE_INFO_CHARSET
495    emit(mask)
496    # pattern length
497    if lo < MAXCODE:
498        emit(lo)
499    else:
500        emit(MAXCODE)
501        prefix = prefix[:MAXCODE]
502    if hi < MAXCODE:
503        emit(hi)
504    else:
505        emit(0)
506    # add literal prefix
507    if prefix:
508        emit(len(prefix)) # length
509        emit(prefix_skip) # skip
510        code.extend(prefix)
511        # generate overlap table
512        code.extend(_generate_overlap_table(prefix))
513    elif charset:
514        _compile_charset(charset, flags, code)
515    code[skip] = len(code) - skip
516
517def isstring(obj):
518    return isinstance(obj, (str, bytes))
519
520def _code(p, flags):
521
522    flags = p.pattern.flags | flags
523    code = []
524
525    # compile info block
526    _compile_info(code, p, flags)
527
528    # compile the pattern
529    _compile(code, p.data, flags)
530
531    code.append(SUCCESS)
532
533    return code
534
535def compile(p, flags=0):
536    # internal: convert pattern list to internal format
537
538    if isstring(p):
539        pattern = p
540        p = sre_parse.parse(p, flags)
541    else:
542        pattern = None
543
544    code = _code(p, flags)
545
546    # print(code)
547
548    # map in either direction
549    groupindex = p.pattern.groupdict
550    indexgroup = [None] * p.pattern.groups
551    for k, i in groupindex.items():
552        indexgroup[i] = k
553
554    return _sre.compile(
555        pattern, flags | p.pattern.flags, code,
556        p.pattern.groups-1,
557        groupindex, indexgroup
558        )
559