1e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka# -*- coding: utf-8 -*- 27627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# 37627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# Secret Labs' Regular Expression Engine 47627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# 57627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# convert template to internal format 67627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# 7770617b23e286f1147f9480b5f625e88e7badd50Fredrik Lundh# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 87627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# 929c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh# See the sre.py file for information on usage and redistribution. 107627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum# 117627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum 12b8f22749853cf79bfbe3709309e67d1a448f4cabFred Drake"""Internal support module for sre""" 13b8f22749853cf79bfbe3709309e67d1a448f4cabFred Drake 144fb7027ec00daa8c5f891888dedc1b387853c1e0Fredrik Lundhimport _sre, sys 154b798bdf8ab3a4a4b3b11ea60a8f0b1c54e43224Amaury Forgeot d'Arcimport sre_parse 167627c0de6968471996ce05aab200115d56efa1d5Guido van Rossumfrom sre_constants import * 177627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum 18b35ffc0417a0861ccf466c0503c4151725a0267aFredrik Lundhassert _sre.MAGIC == MAGIC, "SRE module mismatch" 19b35ffc0417a0861ccf466c0503c4151725a0267aFredrik Lundh 2078e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwisif _sre.CODESIZE == 2: 2178e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis MAXCODE = 65535 2278e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwiselse: 2378e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis MAXCODE = 0xFFFFFFFFL 243562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh 25049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 26049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 27049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_SUCCESS_CODES = set([SUCCESS, FAILURE]) 28049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 29049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger 30e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka# Sets of lowercase characters which have the same uppercase. 31e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka_equivalences = ( 32e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I 33e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x69, 0x131), # iı 34e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S 35e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x73, 0x17f), # sſ 36e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # MICRO SIGN, GREEK SMALL LETTER MU 37e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0xb5, 0x3bc), # µμ 38e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI 39e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x345, 0x3b9, 0x1fbe), # \u0345ιι 40e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL 41e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3b2, 0x3d0), # βϐ 42e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL 43e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3b5, 0x3f5), # εϵ 44e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL 45e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3b8, 0x3d1), # θϑ 46e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL 47e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3ba, 0x3f0), # κϰ 48e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER PI, GREEK PI SYMBOL 49e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3c0, 0x3d6), # πϖ 50e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL 51e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3c1, 0x3f1), # ρϱ 52e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA 53e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3c2, 0x3c3), # ςσ 54e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL 55e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x3c6, 0x3d5), # φϕ 56e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE 57e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka (0x1e61, 0x1e9b), # ṡẛ 58e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka) 59e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka 60e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka# Maps the lowercase code to lowercase codes which have the same uppercase. 61e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka_ignorecase_fixes = {i: tuple(j for j in t if i != j) 62e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for t in _equivalences for i in t} 63e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka 64436c3d58a2570f3b599e59b4071f944f774ec441Fredrik Lundhdef _compile(code, pattern, flags): 6529c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # internal: compile a (sub)pattern 66be2211e9401a0be96915c473ef99041beb5a4992Fredrik Lundh emit = code.append 6701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger _len = len 68049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger LITERAL_CODES = _LITERAL_CODES 69049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger REPEATING_CODES = _REPEATING_CODES 70049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger SUCCESS_CODES = _SUCCESS_CODES 71049ade2997aee8cd6564e05d29dbe0b390ebf27bRaymond Hettinger ASSERT_CODES = _ASSERT_CODES 72e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka if (flags & SRE_FLAG_IGNORECASE and 73e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka not (flags & SRE_FLAG_LOCALE) and 74e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka flags & SRE_FLAG_UNICODE): 75e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka fixes = _ignorecase_fixes 76e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka else: 77e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka fixes = None 787627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum for op, av in pattern: 7901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger if op in LITERAL_CODES: 8090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh if flags & SRE_FLAG_IGNORECASE: 81e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka lo = _sre.getlower(av, flags) 82e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka if fixes and lo in fixes: 83e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(OPCODES[IN_IGNORE]) 84e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka skip = _len(code); emit(0) 85e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka if op is NOT_LITERAL: 86e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(OPCODES[NEGATE]) 87e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for k in (lo,) + fixes[lo]: 88e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(OPCODES[LITERAL]) 89e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(k) 90e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(OPCODES[FAILURE]) 91e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka code[skip] = _len(code) - skip 92e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka else: 93e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(OPCODES[OP_IGNORE[op]]) 94e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka emit(lo) 9590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh else: 9690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[op]) 972e24044f9db23c3d2195a129f53f2deb73a4e4afFredrik Lundh emit(av) 9890a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh elif op is IN: 9990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh if flags & SRE_FLAG_IGNORECASE: 10090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[OP_IGNORE[op]]) 10190a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh def fixup(literal, flags=flags): 1020640e1161f37fd3415e9efdbde1e293efb98978cFredrik Lundh return _sre.getlower(literal, flags) 10390a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh else: 10490a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[op]) 105e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka fixup = None 10601c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 107e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka _compile_charset(av, flags, code, fixup, fixes) 10801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 10943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh elif op is ANY: 11043b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh if flags & SRE_FLAG_DOTALL: 111e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(OPCODES[ANY_ALL]) 11243b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh else: 113e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(OPCODES[ANY]) 11401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger elif op in REPEATING_CODES: 11590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh if flags & SRE_FLAG_TEMPLATE: 11629c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh raise error, "internal: unsupported template operator" 11790a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[REPEAT]) 11801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 11990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(av[0]) 12090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(av[1]) 12190a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh _compile(code, av[2], flags) 12290a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[SUCCESS]) 12301c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 12401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger elif _simple(av) and op is not REPEAT: 12501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger if op is MAX_REPEAT: 12641c99e7f96f7a0f192839801c568d8a80dcc7091Guido van Rossum emit(OPCODES[REPEAT_ONE]) 12741c99e7f96f7a0f192839801c568d8a80dcc7091Guido van Rossum else: 12841c99e7f96f7a0f192839801c568d8a80dcc7091Guido van Rossum emit(OPCODES[MIN_REPEAT_ONE]) 12901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 130e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(av[0]) 131e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(av[1]) 132e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh _compile(code, av[2], flags) 133e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(OPCODES[SUCCESS]) 13401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 13590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh else: 136e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(OPCODES[REPEAT]) 13701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 138e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(av[0]) 139e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(av[1]) 140e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh _compile(code, av[2], flags) 14101c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 14201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger if op is MAX_REPEAT: 143e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(OPCODES[MAX_UNTIL]) 14490a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh else: 145e186983842f0b27606b141010513fa8e3d0cc5dbFredrik Lundh emit(OPCODES[MIN_UNTIL]) 14690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh elif op is SUBPATTERN: 14729c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh if av[0]: 14890a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[MARK]) 14929c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh emit((av[0]-1)*2) 1507898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # _compile_info(code, av[1], flags) 15190a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh _compile(code, av[1], flags) 15229c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh if av[0]: 15390a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(OPCODES[MARK]) 15429c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh emit((av[0]-1)*2+1) 15501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger elif op in SUCCESS_CODES: 15643b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[op]) 15701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger elif op in ASSERT_CODES: 1586f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh emit(OPCODES[op]) 15901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 1606f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh if av[0] >= 0: 1616f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh emit(0) # look ahead 1626f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh else: 1636f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh lo, hi = av[1].getwidth() 1646f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh if lo != hi: 1656f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh raise error, "look-behind requires fixed-width pattern" 1666f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh emit(lo) # look behind 1676f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh _compile(code, av[1], flags) 1686f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh emit(OPCODES[SUCCESS]) 16901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 1706f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh elif op is CALL: 17143b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[op]) 17201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 17343b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh _compile(code, av, flags) 17443b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[SUCCESS]) 17501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 17643b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh elif op is AT: 17743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[op]) 17843b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh if flags & SRE_FLAG_MULTILINE: 179b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh av = AT_MULTILINE.get(av, av) 180b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh if flags & SRE_FLAG_LOCALE: 181b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh av = AT_LOCALE.get(av, av) 182b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh elif flags & SRE_FLAG_UNICODE: 183b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh av = AT_UNICODE.get(av, av) 184b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh emit(ATCODES[av]) 18543b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh elif op is BRANCH: 18629c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh emit(OPCODES[op]) 18743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh tail = [] 18801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger tailappend = tail.append 18943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh for av in av[1]: 19001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skip = _len(code); emit(0) 1917898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # _compile_info(code, av, flags) 19243b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh _compile(code, av, flags) 19343b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[JUMP]) 19401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger tailappend(_len(code)); emit(0) 19501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skip] = _len(code) - skip 19643b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(0) # end of branch 19743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh for tail in tail: 19801c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[tail] = _len(code) - tail 19943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh elif op is CATEGORY: 20043b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[op]) 20143b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh if flags & SRE_FLAG_LOCALE: 202b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh av = CH_LOCALE[av] 20343b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh elif flags & SRE_FLAG_UNICODE: 204b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh av = CH_UNICODE[av] 205b25e1ad253a4d96aea31a7a3fb78522ea354f43aFredrik Lundh emit(CHCODES[av]) 20672b82ba16dea929b3fa9db5208b2353e8449c2d5Fredrik Lundh elif op is GROUPREF: 20743b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh if flags & SRE_FLAG_IGNORECASE: 20843b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[OP_IGNORE[op]]) 20943b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh else: 21043b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(OPCODES[op]) 21143b3b49b5ab486295baef3a35cd8e836f735c065Fredrik Lundh emit(av-1) 212ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer elif op is GROUPREF_EXISTS: 213ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer emit(OPCODES[op]) 214c30faa812c507c94d744419bce7c497f1a283d95Andrew M. Kuchling emit(av[0]-1) 21501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skipyes = _len(code); emit(0) 216ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer _compile(code, av[1], flags) 217ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer if av[2]: 218ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer emit(OPCODES[JUMP]) 21901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger skipno = _len(code); emit(0) 22001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skipyes] = _len(code) - skipyes + 1 221ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer _compile(code, av[2], flags) 22201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skipno] = _len(code) - skipno 223ad3fc44ccb40f2ad33c0d09f5a2dfbd4feb442ebGustavo Niemeyer else: 22401c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger code[skipyes] = _len(code) - skipyes + 1 22590a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh else: 22690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh raise ValueError, ("unsupported operand type", op) 2277627c0de6968471996ce05aab200115d56efa1d5Guido van Rossum 228e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchakadef _compile_charset(charset, flags, code, fixup=None, fixes=None): 2297898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # compile charset subprogram 2307898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit = code.append 231e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for op, av in _optimize_charset(charset, fixup, fixes, 232e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka flags & SRE_FLAG_UNICODE): 2337898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(OPCODES[op]) 2347898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if op is NEGATE: 2357898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh pass 2367898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is LITERAL: 237e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka emit(av) 2387898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is RANGE: 239e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka emit(av[0]) 240e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka emit(av[1]) 2417898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is CHARSET: 2427898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh code.extend(av) 24319af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh elif op is BIGCHARSET: 24419af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh code.extend(av) 2457898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is CATEGORY: 2467898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if flags & SRE_FLAG_LOCALE: 2477898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(CHCODES[CH_LOCALE[av]]) 2487898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif flags & SRE_FLAG_UNICODE: 2497898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(CHCODES[CH_UNICODE[av]]) 2507898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 2517898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(CHCODES[av]) 2527898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 2537898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh raise error, "internal: unsupported set operator" 2547898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(OPCODES[FAILURE]) 2557898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh 256e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchakadef _optimize_charset(charset, fixup, fixes, isunicode): 2577898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # internal: optimize character set 2587898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh out = [] 259c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka tail = [] 260c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka charmap = bytearray(256) 261c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka for op, av in charset: 262c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka while True: 263c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka try: 264c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if op is LITERAL: 265e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if fixup: 266e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka i = fixup(av) 267e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka charmap[i] = 1 268e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka if fixes and i in fixes: 269e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for k in fixes[i]: 270e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka charmap[k] = 1 271e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka else: 272e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka charmap[av] = 1 273c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka elif op is RANGE: 274e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka r = range(av[0], av[1]+1) 275e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if fixup: 276e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka r = map(fixup, r) 277e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka if fixup and fixes: 278e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for i in r: 279e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka charmap[i] = 1 280e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka if i in fixes: 281e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for k in fixes[i]: 282e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka charmap[k] = 1 283e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka else: 284e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka for i in r: 285e927757df62e62f5d45640ee76e4048703d8d419Serhiy Storchaka charmap[i] = 1 286c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka elif op is NEGATE: 287c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out.append((op, av)) 288c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka else: 289c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka tail.append((op, av)) 290c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka except IndexError: 291c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if len(charmap) == 256: 292c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # character set contains non-UCS1 character codes 293c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka charmap += b'\0' * 0xff00 294c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka continue 295c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # character set contains non-BMP character codes 296e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if fixup and isunicode and op is RANGE: 297e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka lo, hi = av 298e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka ranges = [av] 299e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka # There are only two ranges of cased astral characters: 300e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi). 301e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka _fixup_range(max(0x10000, lo), min(0x11fff, hi), 302e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka ranges, fixup) 303e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka for lo, hi in ranges: 304e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if lo == hi: 305e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka tail.append((LITERAL, hi)) 306e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka else: 307e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka tail.append((RANGE, (lo, hi))) 308e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka else: 309e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka tail.append((op, av)) 310c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka break 311c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka 3127898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # compress character map 3137898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh runs = [] 314c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka q = 0 315c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka while True: 316c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka p = charmap.find(b'\1', q) 317c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if p < 0: 318c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka break 319c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if len(runs) >= 2: 320c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka runs = None 321c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka break 322c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka q = charmap.find(b'\0', p) 323c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if q < 0: 324c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka runs.append((p, len(charmap))) 325c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka break 326c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka runs.append((p, q)) 327c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if runs is not None: 3287898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # use literal/range 329c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka for p, q in runs: 330c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if q - p == 1: 331c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out.append((LITERAL, p)) 3327898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 333c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out.append((RANGE, (p, q - 1))) 334c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out += tail 335e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka # if the case was changed or new representation is more compact 336e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if fixup or len(out) < len(charset): 3377898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh return out 338e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka # else original character set is good enough 339c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka return charset 340c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka 341c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # use bitmap 342c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if len(charmap) == 256: 34319af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh data = _mk_bitmap(charmap) 344c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out.append((CHARSET, data)) 345c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out += tail 3467898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh return out 3477898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh 348c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # To represent a big charset, first a bitmap of all characters in the 349c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # set is constructed. Then, this bitmap is sliced into chunks of 256 350c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # characters, duplicate chunks are eliminated, and each chunk is 351c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # given a number. In the compiled expression, the charset is 352c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # represented by a 32-bit word sequence, consisting of one word for 353c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # the number of different chunks, a sequence of 256 bytes (64 words) 354c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # of chunk numbers indexed by their original chunk position, and a 355c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # sequence of 256-bit chunks (8 words each). 35619af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh 357c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # Compression is normally good: in a typical charset, large ranges of 358c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # Unicode will be either completely excluded (e.g. if only cyrillic 359c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # letters are to be matched), or completely included (e.g. if large 360c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # subranges of Kanji match). These ranges will be represented by 361c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # chunks of all one-bits or all zero-bits. 36219af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh 363c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # Matching can be also done efficiently: the more significant byte of 364c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # the Unicode character is an index into the chunk number, and the 365c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # less significant byte is a bit index in the chunk (just like the 366c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # CHARSET matching). 36719af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh 368c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 369c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # of the basic multilingual plane; an efficient representation 370c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # for all of Unicode has not yet been developed. 37119af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh 372c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka charmap = bytes(charmap) # should be hashable 37319af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh comps = {} 374c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka mapping = bytearray(256) 37519af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh block = 0 376c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka data = bytearray() 377c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka for i in range(0, 65536, 256): 378c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka chunk = charmap[i: i + 256] 379c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka if chunk in comps: 380c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka mapping[i // 256] = comps[chunk] 381c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka else: 382c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka mapping[i // 256] = comps[chunk] = block 383c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka block += 1 384c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka data += chunk 385c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka data = _mk_bitmap(data) 386c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka data[0:0] = [block] + _bytes_to_codes(mapping) 387c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out.append((BIGCHARSET, data)) 388c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka out += tail 389c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka return out 390c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka 391e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchakadef _fixup_range(lo, hi, ranges, fixup): 392e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka for i in map(fixup, range(lo, hi+1)): 393e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka for k, (lo, hi) in enumerate(ranges): 394e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if i < lo: 395e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if l == lo - 1: 396e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka ranges[k] = (i, hi) 397e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka else: 398e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka ranges.insert(k, (i, i)) 399e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka break 400e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka elif i > hi: 401e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka if i == hi + 1: 402e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka ranges[k] = (lo, i) 403e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka break 404e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka else: 405e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka break 406e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka else: 407e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka ranges.append((i, i)) 408e9e54ae2228e195e046e6325de018374b3247d2dSerhiy Storchaka 409c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka_CODEBITS = _sre.CODESIZE * 8 410c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka_BITS_TRANS = b'0' + b'1' * 255 411c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchakadef _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): 412c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka s = bytes(bits).translate(_BITS_TRANS)[::-1] 413c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka return [_int(s[i - _CODEBITS: i], 2) 414c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka for i in range(len(s), 0, -_CODEBITS)] 415c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka 416c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchakadef _bytes_to_codes(b): 417c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka # Convert block indices to word array 418c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka import array 4197d9c6c7e8c1e381de7e96989c1332cf98d766f3aMartin v. Löwis if _sre.CODESIZE == 2: 42078e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis code = 'H' 42178e2f06cc66178887ee0d6d243370efa241a675aMartin v. Löwis else: 4227d9c6c7e8c1e381de7e96989c1332cf98d766f3aMartin v. Löwis code = 'I' 423c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka a = array.array(code, bytes(b)) 424c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka assert a.itemsize == _sre.CODESIZE 425c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka assert len(a) * a.itemsize == len(b) 426c04fcd40bd8fd2c9e427faded617214f8ae18402Serhiy Storchaka return a.tolist() 42719af43d78a8bd85dc39ea62cc4bc130778cfc643Fredrik Lundh 4287898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundhdef _simple(av): 4297898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # check if av is a "simple" operator 4307898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh lo, hi = av[2].getwidth() 4317898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh return lo == hi == 1 and av[2][0][0] != SUBPATTERN 4327898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh 43329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundhdef _compile_info(code, pattern, flags): 43429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # internal: compile an info block. in the current version, 4353562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh # this contains min/max pattern width, and an optional literal 4363562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh # prefix or a character map 43729c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh lo, hi = pattern.getwidth() 43829c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh if lo == 0: 43990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh return # not worth it 44029c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # look for a literal prefix 44129c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh prefix = [] 44201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger prefixappend = prefix.append 4437898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh prefix_skip = 0 4443562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh charset = [] # not used 44501c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger charsetappend = charset.append 44629c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh if not (flags & SRE_FLAG_IGNORECASE): 4477898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # look for literal prefix 44890a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh for op, av in pattern.data: 44990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh if op is LITERAL: 4507898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if len(prefix) == prefix_skip: 4517898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh prefix_skip = prefix_skip + 1 45201c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger prefixappend(av) 4537898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is SUBPATTERN and len(av[1]) == 1: 4547898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh op, av = av[1][0] 4557898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if op is LITERAL: 45601c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger prefixappend(av) 4577898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 4587898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh break 45990a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh else: 46090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh break 4617898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # if no prefix, look for charset prefix 4627898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if not prefix and pattern.data: 4637898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh op, av = pattern.data[0] 4647898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if op is SUBPATTERN and av[1]: 4657898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh op, av = av[1][0] 4667898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if op is LITERAL: 46701c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger charsetappend((op, av)) 4687898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is BRANCH: 4697898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh c = [] 47001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger cappend = c.append 4717898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh for p in av[1]: 4727898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if not p: 4737898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh break 4747898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh op, av = p[0] 4757898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if op is LITERAL: 47601c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger cappend((op, av)) 4777898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 4787898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh break 4797898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 4807898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh charset = c 4817898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is BRANCH: 4827898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh c = [] 48301c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger cappend = c.append 4847898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh for p in av[1]: 4857898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if not p: 4867898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh break 4877898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh op, av = p[0] 4887898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if op is LITERAL: 48901c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger cappend((op, av)) 4907898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 4917898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh break 4927898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh else: 4937898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh charset = c 4947898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh elif op is IN: 4957898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh charset = av 4967898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh## if prefix: 4977898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh## print "*** PREFIX", prefix, prefix_skip 4987898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh## if charset: 4997898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh## print "*** CHARSET", charset 50029c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # add an info block 50129c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh emit = code.append 50229c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh emit(OPCODES[INFO]) 50329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh skip = len(code); emit(0) 50429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # literal flag 50529c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh mask = 0 5063562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh if prefix: 5073562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh mask = SRE_INFO_PREFIX 5087898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh if len(prefix) == prefix_skip == len(pattern.data): 5093562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh mask = mask + SRE_INFO_LITERAL 5103562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh elif charset: 5113562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh mask = mask + SRE_INFO_CHARSET 51229c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh emit(mask) 51329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # pattern length 5143562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh if lo < MAXCODE: 5153562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh emit(lo) 5163562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh else: 5173562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh emit(MAXCODE) 5183562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh prefix = prefix[:MAXCODE] 5193562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh if hi < MAXCODE: 52090a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(hi) 52129c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh else: 52290a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh emit(0) 52329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # add literal prefix 52429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh if prefix: 5257898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(len(prefix)) # length 5267898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh emit(prefix_skip) # skip 5277898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh code.extend(prefix) 5287898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh # generate overlap table 5297898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh table = [-1] + ([0]*len(prefix)) 53001c9f8c35f583338e3638906ceeef9d2f29a0254Raymond Hettinger for i in xrange(len(prefix)): 5317898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh table[i+1] = table[i]+1 5327898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 5337898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh table[i+1] = table[table[i+1]-1]+1 5347898c3e6852565046a9b8b063d35d66777bf5176Fredrik Lundh code.extend(table[1:]) # don't store first entry 5353562f1176403653ebfbef6275d449ad42d1b843aFredrik Lundh elif charset: 536577fb5a1db508444eed7bd78fbf76c1f64ed643bGuido van Rossum _compile_charset(charset, flags, code) 53729c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh code[skip] = len(code) - skip 53829c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 5398a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundhtry: 54012723baceab61f8812d68575c962696cc4e77fa1Just van Rossum unicode 5418a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundhexcept NameError: 54274902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum STRING_TYPES = (type(""),) 54312723baceab61f8812d68575c962696cc4e77fa1Just van Rossumelse: 54412723baceab61f8812d68575c962696cc4e77fa1Just van Rossum STRING_TYPES = (type(""), type(unicode(""))) 5458a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundh 54674902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossumdef isstring(obj): 54774902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum for tp in STRING_TYPES: 54874902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum if isinstance(obj, tp): 54974902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum return 1 55074902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum return 0 55174902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum 5522f2c67d7e5934bdf96835f3c4774388b3e654314Fredrik Lundhdef _code(p, flags): 55329c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 554b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton flags = p.pattern.flags | flags 555be2211e9401a0be96915c473ef99041beb5a4992Fredrik Lundh code = [] 55629c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 55729c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # compile info block 55829c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh _compile_info(code, p, flags) 55929c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 56029c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh # compile the pattern 561b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton _compile(code, p.data, flags) 56229c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 563b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton code.append(OPCODES[SUCCESS]) 56429c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 56529c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh return code 56629c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh 56729c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundhdef compile(p, flags=0): 56829c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh # internal: convert pattern list to internal format 56929c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh 57074902508dc395014dbdb9c2ed08263202e5d4e30Just van Rossum if isstring(p): 57129c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh pattern = p 57229c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh p = sre_parse.parse(p, flags) 57329c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh else: 57429c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh pattern = None 57529c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh 5762f2c67d7e5934bdf96835f3c4774388b3e654314Fredrik Lundh code = _code(p, flags) 57729c4ba9ada44d62988c62c85c8046985f10a1c85Fredrik Lundh 5788a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundh # print code 5798a3ebf8ca87a7d2989148d7c218974ab4235ca5dFredrik Lundh 580770617b23e286f1147f9480b5f625e88e7badd50Fredrik Lundh # XXX: <fl> get rid of this limitation! 5815e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh if p.pattern.groups > 100: 5825e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh raise AssertionError( 5835e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh "sorry, but this version only supports 100 named groups" 5845e7d51b62cc86314987b833a2a800b9528e906d7Fredrik Lundh ) 58529c08beab08ae3e8b9686a119f5cf0afe99ed918Fredrik Lundh 586c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh # map in either direction 587c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh groupindex = p.pattern.groupdict 588c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh indexgroup = [None] * p.pattern.groups 589c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh for k, i in groupindex.items(): 590c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh indexgroup[i] = k 591c2301730b8e07ece0b7c4ff6b6b06a4895d370c7Fredrik Lundh 592b1aa19515ffdb84c6633ee0344196fd8bd50ade0Jeremy Hylton return _sre.compile( 593ae04c3356ed2aec0e9e2c39096a3ccd05722575aGuido van Rossum pattern, flags | p.pattern.flags, code, 5946f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh p.pattern.groups-1, 5956f013982366154ce570f69b6117dbcc6b1d5d89aFredrik Lundh groupindex, indexgroup 59690a07913229ada1bb3011cfa08a1e56bca31daafFredrik Lundh ) 597