sre_compile.py revision 83e802796c80f46be616b48020356f7f51be533d
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre
14import sre_parse
15from sre_constants import *
16
17assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
19_LITERAL_CODES = {LITERAL, NOT_LITERAL}
20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
23
24# Sets of lowercase characters which have the same uppercase.
25_equivalences = (
26    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
27    (0x69, 0x131), # iı
28    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
29    (0x73, 0x17f), # sſ
30    # MICRO SIGN, GREEK SMALL LETTER MU
31    (0xb5, 0x3bc), # µμ
32    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
33    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
34    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
35    (0x390, 0x1fd3), # ΐΐ
36    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
37    (0x3b0, 0x1fe3), # ΰΰ
38    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
39    (0x3b2, 0x3d0), # βϐ
40    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
41    (0x3b5, 0x3f5), # εϵ
42    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
43    (0x3b8, 0x3d1), # θϑ
44    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
45    (0x3ba, 0x3f0), # κϰ
46    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
47    (0x3c0, 0x3d6), # πϖ
48    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
49    (0x3c1, 0x3f1), # ρϱ
50    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
51    (0x3c2, 0x3c3), # ςσ
52    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
53    (0x3c6, 0x3d5), # φϕ
54    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
55    (0x1e61, 0x1e9b), # ṡẛ
56    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
57    (0xfb05, 0xfb06), # ſtst
58)
59
60# Maps the lowercase code to lowercase codes which have the same uppercase.
61_ignorecase_fixes = {i: tuple(j for j in t if i != j)
62                     for t in _equivalences for i in t}
63
64def _compile(code, pattern, flags):
65    # internal: compile a (sub)pattern
66    emit = code.append
67    _len = len
68    LITERAL_CODES = _LITERAL_CODES
69    REPEATING_CODES = _REPEATING_CODES
70    SUCCESS_CODES = _SUCCESS_CODES
71    ASSERT_CODES = _ASSERT_CODES
72    if (flags & SRE_FLAG_IGNORECASE and
73            not (flags & SRE_FLAG_LOCALE) and
74            flags & SRE_FLAG_UNICODE):
75        fixes = _ignorecase_fixes
76    else:
77        fixes = None
78    for op, av in pattern:
79        if op in LITERAL_CODES:
80            if flags & SRE_FLAG_IGNORECASE:
81                lo = _sre.getlower(av, flags)
82                if fixes and lo in fixes:
83                    emit(IN_IGNORE)
84                    skip = _len(code); emit(0)
85                    if op is NOT_LITERAL:
86                        emit(NEGATE)
87                    for k in (lo,) + fixes[lo]:
88                        emit(LITERAL)
89                        emit(k)
90                    emit(FAILURE)
91                    code[skip] = _len(code) - skip
92                else:
93                    emit(OP_IGNORE[op])
94                    emit(lo)
95            else:
96                emit(op)
97                emit(av)
98        elif op is IN:
99            if flags & SRE_FLAG_IGNORECASE:
100                emit(OP_IGNORE[op])
101                def fixup(literal, flags=flags):
102                    return _sre.getlower(literal, flags)
103            else:
104                emit(op)
105                fixup = None
106            skip = _len(code); emit(0)
107            _compile_charset(av, flags, code, fixup, fixes)
108            code[skip] = _len(code) - skip
109        elif op is ANY:
110            if flags & SRE_FLAG_DOTALL:
111                emit(ANY_ALL)
112            else:
113                emit(ANY)
114        elif op in REPEATING_CODES:
115            if flags & SRE_FLAG_TEMPLATE:
116                raise error("internal: unsupported template operator")
117            elif _simple(av) and op is not REPEAT:
118                if op is MAX_REPEAT:
119                    emit(REPEAT_ONE)
120                else:
121                    emit(MIN_REPEAT_ONE)
122                skip = _len(code); emit(0)
123                emit(av[0])
124                emit(av[1])
125                _compile(code, av[2], flags)
126                emit(SUCCESS)
127                code[skip] = _len(code) - skip
128            else:
129                emit(REPEAT)
130                skip = _len(code); emit(0)
131                emit(av[0])
132                emit(av[1])
133                _compile(code, av[2], flags)
134                code[skip] = _len(code) - skip
135                if op is MAX_REPEAT:
136                    emit(MAX_UNTIL)
137                else:
138                    emit(MIN_UNTIL)
139        elif op is SUBPATTERN:
140            if av[0]:
141                emit(MARK)
142                emit((av[0]-1)*2)
143            # _compile_info(code, av[1], flags)
144            _compile(code, av[1], flags)
145            if av[0]:
146                emit(MARK)
147                emit((av[0]-1)*2+1)
148        elif op in SUCCESS_CODES:
149            emit(op)
150        elif op in ASSERT_CODES:
151            emit(op)
152            skip = _len(code); emit(0)
153            if av[0] >= 0:
154                emit(0) # look ahead
155            else:
156                lo, hi = av[1].getwidth()
157                if lo != hi:
158                    raise error("look-behind requires fixed-width pattern")
159                emit(lo) # look behind
160            _compile(code, av[1], flags)
161            emit(SUCCESS)
162            code[skip] = _len(code) - skip
163        elif op is CALL:
164            emit(op)
165            skip = _len(code); emit(0)
166            _compile(code, av, flags)
167            emit(SUCCESS)
168            code[skip] = _len(code) - skip
169        elif op is AT:
170            emit(op)
171            if flags & SRE_FLAG_MULTILINE:
172                av = AT_MULTILINE.get(av, av)
173            if flags & SRE_FLAG_LOCALE:
174                av = AT_LOCALE.get(av, av)
175            elif flags & SRE_FLAG_UNICODE:
176                av = AT_UNICODE.get(av, av)
177            emit(av)
178        elif op is BRANCH:
179            emit(op)
180            tail = []
181            tailappend = tail.append
182            for av in av[1]:
183                skip = _len(code); emit(0)
184                # _compile_info(code, av, flags)
185                _compile(code, av, flags)
186                emit(JUMP)
187                tailappend(_len(code)); emit(0)
188                code[skip] = _len(code) - skip
189            emit(FAILURE) # end of branch
190            for tail in tail:
191                code[tail] = _len(code) - tail
192        elif op is CATEGORY:
193            emit(op)
194            if flags & SRE_FLAG_LOCALE:
195                av = CH_LOCALE[av]
196            elif flags & SRE_FLAG_UNICODE:
197                av = CH_UNICODE[av]
198            emit(av)
199        elif op is GROUPREF:
200            if flags & SRE_FLAG_IGNORECASE:
201                emit(OP_IGNORE[op])
202            else:
203                emit(op)
204            emit(av-1)
205        elif op is GROUPREF_EXISTS:
206            emit(op)
207            emit(av[0]-1)
208            skipyes = _len(code); emit(0)
209            _compile(code, av[1], flags)
210            if av[2]:
211                emit(JUMP)
212                skipno = _len(code); emit(0)
213                code[skipyes] = _len(code) - skipyes + 1
214                _compile(code, av[2], flags)
215                code[skipno] = _len(code) - skipno
216            else:
217                code[skipyes] = _len(code) - skipyes + 1
218        else:
219            raise ValueError("unsupported operand type", op)
220
221def _compile_charset(charset, flags, code, fixup=None, fixes=None):
222    # compile charset subprogram
223    emit = code.append
224    for op, av in _optimize_charset(charset, fixup, fixes):
225        emit(op)
226        if op is NEGATE:
227            pass
228        elif op is LITERAL:
229            emit(av)
230        elif op is RANGE or op is RANGE_IGNORE:
231            emit(av[0])
232            emit(av[1])
233        elif op is CHARSET:
234            code.extend(av)
235        elif op is BIGCHARSET:
236            code.extend(av)
237        elif op is CATEGORY:
238            if flags & SRE_FLAG_LOCALE:
239                emit(CH_LOCALE[av])
240            elif flags & SRE_FLAG_UNICODE:
241                emit(CH_UNICODE[av])
242            else:
243                emit(av)
244        else:
245            raise error("internal: unsupported set operator")
246    emit(FAILURE)
247
248def _optimize_charset(charset, fixup, fixes):
249    # internal: optimize character set
250    out = []
251    tail = []
252    charmap = bytearray(256)
253    for op, av in charset:
254        while True:
255            try:
256                if op is LITERAL:
257                    if fixup:
258                        lo = fixup(av)
259                        charmap[lo] = 1
260                        if fixes and lo in fixes:
261                            for k in fixes[lo]:
262                                charmap[k] = 1
263                    else:
264                        charmap[av] = 1
265                elif op is RANGE:
266                    r = range(av[0], av[1]+1)
267                    if fixup:
268                        r = map(fixup, r)
269                    if fixup and fixes:
270                        for i in r:
271                            charmap[i] = 1
272                            if i in fixes:
273                                for k in fixes[i]:
274                                    charmap[k] = 1
275                    else:
276                        for i in r:
277                            charmap[i] = 1
278                elif op is NEGATE:
279                    out.append((op, av))
280                else:
281                    tail.append((op, av))
282            except IndexError:
283                if len(charmap) == 256:
284                    # character set contains non-UCS1 character codes
285                    charmap += b'\0' * 0xff00
286                    continue
287                # Character set contains non-BMP character codes.
288                # There are only two ranges of cased non-BMP characters:
289                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
290                # and for both ranges RANGE_IGNORE works.
291                if fixup and op is RANGE:
292                    op = RANGE_IGNORE
293                tail.append((op, av))
294            break
295
296    # compress character map
297    runs = []
298    q = 0
299    while True:
300        p = charmap.find(1, q)
301        if p < 0:
302            break
303        if len(runs) >= 2:
304            runs = None
305            break
306        q = charmap.find(0, p)
307        if q < 0:
308            runs.append((p, len(charmap)))
309            break
310        runs.append((p, q))
311    if runs is not None:
312        # use literal/range
313        for p, q in runs:
314            if q - p == 1:
315                out.append((LITERAL, p))
316            else:
317                out.append((RANGE, (p, q - 1)))
318        out += tail
319        # if the case was changed or new representation is more compact
320        if fixup or len(out) < len(charset):
321            return out
322        # else original character set is good enough
323        return charset
324
325    # use bitmap
326    if len(charmap) == 256:
327        data = _mk_bitmap(charmap)
328        out.append((CHARSET, data))
329        out += tail
330        return out
331
332    # To represent a big charset, first a bitmap of all characters in the
333    # set is constructed. Then, this bitmap is sliced into chunks of 256
334    # characters, duplicate chunks are eliminated, and each chunk is
335    # given a number. In the compiled expression, the charset is
336    # represented by a 32-bit word sequence, consisting of one word for
337    # the number of different chunks, a sequence of 256 bytes (64 words)
338    # of chunk numbers indexed by their original chunk position, and a
339    # sequence of 256-bit chunks (8 words each).
340
341    # Compression is normally good: in a typical charset, large ranges of
342    # Unicode will be either completely excluded (e.g. if only cyrillic
343    # letters are to be matched), or completely included (e.g. if large
344    # subranges of Kanji match). These ranges will be represented by
345    # chunks of all one-bits or all zero-bits.
346
347    # Matching can be also done efficiently: the more significant byte of
348    # the Unicode character is an index into the chunk number, and the
349    # less significant byte is a bit index in the chunk (just like the
350    # CHARSET matching).
351
352    charmap = bytes(charmap) # should be hashable
353    comps = {}
354    mapping = bytearray(256)
355    block = 0
356    data = bytearray()
357    for i in range(0, 65536, 256):
358        chunk = charmap[i: i + 256]
359        if chunk in comps:
360            mapping[i // 256] = comps[chunk]
361        else:
362            mapping[i // 256] = comps[chunk] = block
363            block += 1
364            data += chunk
365    data = _mk_bitmap(data)
366    data[0:0] = [block] + _bytes_to_codes(mapping)
367    out.append((BIGCHARSET, data))
368    out += tail
369    return out
370
371_CODEBITS = _sre.CODESIZE * 8
372MAXCODE = (1 << _CODEBITS) - 1
373_BITS_TRANS = b'0' + b'1' * 255
374def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
375    s = bits.translate(_BITS_TRANS)[::-1]
376    return [_int(s[i - _CODEBITS: i], 2)
377            for i in range(len(s), 0, -_CODEBITS)]
378
379def _bytes_to_codes(b):
380    # Convert block indices to word array
381    a = memoryview(b).cast('I')
382    assert a.itemsize == _sre.CODESIZE
383    assert len(a) * a.itemsize == len(b)
384    return a.tolist()
385
386def _simple(av):
387    # check if av is a "simple" operator
388    lo, hi = av[2].getwidth()
389    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
390
391def _generate_overlap_table(prefix):
392    """
393    Generate an overlap table for the following prefix.
394    An overlap table is a table of the same size as the prefix which
395    informs about the potential self-overlap for each index in the prefix:
396    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
397    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
398      prefix[0:k]
399    """
400    table = [0] * len(prefix)
401    for i in range(1, len(prefix)):
402        idx = table[i - 1]
403        while prefix[i] != prefix[idx]:
404            if idx == 0:
405                table[i] = 0
406                break
407            idx = table[idx - 1]
408        else:
409            table[i] = idx + 1
410    return table
411
412def _compile_info(code, pattern, flags):
413    # internal: compile an info block.  in the current version,
414    # this contains min/max pattern width, and an optional literal
415    # prefix or a character map
416    lo, hi = pattern.getwidth()
417    if hi > MAXCODE:
418        hi = MAXCODE
419    if lo == 0:
420        code.extend([INFO, 4, 0, lo, hi])
421        return
422    # look for a literal prefix
423    prefix = []
424    prefixappend = prefix.append
425    prefix_skip = 0
426    charset = [] # not used
427    charsetappend = charset.append
428    if not (flags & SRE_FLAG_IGNORECASE):
429        # look for literal prefix
430        for op, av in pattern.data:
431            if op is LITERAL:
432                if len(prefix) == prefix_skip:
433                    prefix_skip = prefix_skip + 1
434                prefixappend(av)
435            elif op is SUBPATTERN and len(av[1]) == 1:
436                op, av = av[1][0]
437                if op is LITERAL:
438                    prefixappend(av)
439                else:
440                    break
441            else:
442                break
443        # if no prefix, look for charset prefix
444        if not prefix and pattern.data:
445            op, av = pattern.data[0]
446            if op is SUBPATTERN and av[1]:
447                op, av = av[1][0]
448                if op is LITERAL:
449                    charsetappend((op, av))
450                elif op is BRANCH:
451                    c = []
452                    cappend = c.append
453                    for p in av[1]:
454                        if not p:
455                            break
456                        op, av = p[0]
457                        if op is LITERAL:
458                            cappend((op, av))
459                        else:
460                            break
461                    else:
462                        charset = c
463            elif op is BRANCH:
464                c = []
465                cappend = c.append
466                for p in av[1]:
467                    if not p:
468                        break
469                    op, av = p[0]
470                    if op is LITERAL:
471                        cappend((op, av))
472                    else:
473                        break
474                else:
475                    charset = c
476            elif op is IN:
477                charset = av
478##     if prefix:
479##         print("*** PREFIX", prefix, prefix_skip)
480##     if charset:
481##         print("*** CHARSET", charset)
482    # add an info block
483    emit = code.append
484    emit(INFO)
485    skip = len(code); emit(0)
486    # literal flag
487    mask = 0
488    if prefix:
489        mask = SRE_INFO_PREFIX
490        if len(prefix) == prefix_skip == len(pattern.data):
491            mask = mask | SRE_INFO_LITERAL
492    elif charset:
493        mask = mask | SRE_INFO_CHARSET
494    emit(mask)
495    # pattern length
496    if lo < MAXCODE:
497        emit(lo)
498    else:
499        emit(MAXCODE)
500        prefix = prefix[:MAXCODE]
501    emit(min(hi, MAXCODE))
502    # add literal prefix
503    if prefix:
504        emit(len(prefix)) # length
505        emit(prefix_skip) # skip
506        code.extend(prefix)
507        # generate overlap table
508        code.extend(_generate_overlap_table(prefix))
509    elif charset:
510        _compile_charset(charset, flags, code)
511    code[skip] = len(code) - skip
512
513def isstring(obj):
514    return isinstance(obj, (str, bytes))
515
516def _code(p, flags):
517
518    flags = p.pattern.flags | flags
519    code = []
520
521    # compile info block
522    _compile_info(code, p, flags)
523
524    # compile the pattern
525    _compile(code, p.data, flags)
526
527    code.append(SUCCESS)
528
529    return code
530
531def compile(p, flags=0):
532    # internal: convert pattern list to internal format
533
534    if isstring(p):
535        pattern = p
536        p = sre_parse.parse(p, flags)
537    else:
538        pattern = None
539
540    code = _code(p, flags)
541
542    # print(code)
543
544    # map in either direction
545    groupindex = p.pattern.groupdict
546    indexgroup = [None] * p.pattern.groups
547    for k, i in groupindex.items():
548        indexgroup[i] = k
549
550    return _sre.compile(
551        pattern, flags | p.pattern.flags, code,
552        p.pattern.groups-1,
553        groupindex, indexgroup
554        )
555