1from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
2        cpython_only, captured_stdout
3import io
4import locale
5import re
6import sre_compile
7import string
8import sys
9import traceback
10import unittest
11import warnings
12from re import Scanner
13from weakref import proxy
14
15# Misc tests from Tim Peters' re.doc
16
17# WARNING: Don't change details in these tests if you don't know
18# what you're doing. Some of these tests were carefully modeled to
19# cover most of the code.
20
class S(str):
    """str subclass whose indexing and slicing results are wrapped in S again."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return S(piece)
24
class B(bytes):
    """bytes subclass whose indexing and slicing results are passed to B again."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return B(piece)
28
29class ReTests(unittest.TestCase):
30
31    def assertTypedEqual(self, actual, expect, msg=None):
32        self.assertEqual(actual, expect, msg)
33        def recurse(actual, expect):
34            if isinstance(expect, (tuple, list)):
35                for x, y in zip(actual, expect):
36                    recurse(x, y)
37            else:
38                self.assertIs(type(actual), type(expect), msg)
39        recurse(actual, expect)
40
41    def checkPatternError(self, pattern, errmsg, pos=None):
42        with self.assertRaises(re.error) as cm:
43            re.compile(pattern)
44        with self.subTest(pattern=pattern):
45            err = cm.exception
46            self.assertEqual(err.msg, errmsg)
47            if pos is not None:
48                self.assertEqual(err.pos, pos)
49
50    def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
51        with self.assertRaises(re.error) as cm:
52            re.sub(pattern, repl, string)
53        with self.subTest(pattern=pattern, repl=repl):
54            err = cm.exception
55            self.assertEqual(err.msg, errmsg)
56            if pos is not None:
57                self.assertEqual(err.pos, pos)
58
59    def test_keep_buffer(self):
60        # See bug 14212
61        b = bytearray(b'x')
62        it = re.finditer(b'a', b)
63        with self.assertRaises(BufferError):
64            b.extend(b'x'*400)
65        list(it)
66        del it
67        gc_collect()
68        b.extend(b'x'*400)
69
70    def test_weakref(self):
71        s = 'QabbbcR'
72        x = re.compile('ab+c')
73        y = proxy(x)
74        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
76    def test_search_star_plus(self):
77        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
81        self.assertIsNone(re.search('x', 'aaa'))
82        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
86        self.assertIsNone(re.match('a+', 'xxx'))
87
88    def bump_num(self, matchobj):
89        int_value = int(matchobj.group(0))
90        return str(int_value + 1)
91
    def test_basic_re_sub(self):
        # Exercises re.sub() with str and bytes-like inputs, callable
        # replacements, group references and escape handling in templates.

        # assertTypedEqual also compares result types: the expected values are
        # plain str / bytes even when subclasses or other buffers are passed in.
        self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
        self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
        self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
        self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
        self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
        self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
        # Non-ASCII characters used as the pattern.
        for y in ("\xe0", "\u0430", "\U0001d49c"):
            self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        # Callable replacement, without a count and with count given
        # positionally and by keyword.
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                         '9.3 -3 23x99y')

        # A callable's return value is inserted verbatim, whereas the same
        # text given as a template has its escape processed.
        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        # r"\1\1" expands group 1 twice as a template; escaped, or returned
        # from a callable, it comes back unchanged.
        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        # Group references by name and by number.
        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

        # Control-character escapes: written as template escapes (raw string)
        # and as the literal characters themselves.
        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
        # Every other letter escape is expected to emit a DeprecationWarning
        # and to be left in the result unchanged.
        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
            with self.subTest(c):
                with self.assertWarns(DeprecationWarning):
                    self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)

        # A pattern that matches the empty string at the start of the subject.
        self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
133
134    def test_bug_449964(self):
135        # fails for group followed by other escape
136        self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
137                         'xx\bxx\b')
138
139    def test_bug_449000(self):
140        # Test for sub() on escaped characters
141        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
142                         'abc\ndef\n')
143        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
144                         'abc\ndef\n')
145        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
146                         'abc\ndef\n')
147        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
148                         'abc\ndef\n')
149
150    def test_bug_1661(self):
151        # Verify that flags do not get silently ignored with compiled patterns
152        pattern = re.compile('.')
153        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
154        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
155        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
156        self.assertRaises(ValueError, re.compile, pattern, re.I)
157
158    def test_bug_3629(self):
159        # A regex that triggered a bug in the sre-code validator
160        re.compile("(?P<quote>)(?(quote))")
161
162    def test_sub_template_numeric_escape(self):
163        # bug 776311 and friends
164        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
165        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
166        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
167        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
168        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
169        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
170        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
171        self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
172
173        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
174        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
175
176        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
177        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
178        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
179        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
180        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
181
182        self.checkTemplateError('x', r'\400', 'x',
183                                r'octal escape value \400 outside of '
184                                r'range 0-0o377', 0)
185        self.checkTemplateError('x', r'\777', 'x',
186                                r'octal escape value \777 outside of '
187                                r'range 0-0o377', 0)
188
189        self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
190        self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
191        self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
192        self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
193        self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
194        self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
195        self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
196        self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
197        self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
198        self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
199        self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
200        self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
201        self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)
202
203        # in python2.3 (etc), these loop endlessly in sre_parser.py
204        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
205        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
206                         'xz8')
207        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
208                         'xza')
209
210    def test_qualified_re_sub(self):
211        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
212        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
213        self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
214
215    def test_bug_114660(self):
216        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
217                         'hello there')
218
219    def test_bug_462270(self):
220        # Test for empty sub() behaviour, see SF bug #462270
221        self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
222        self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
223
224    def test_symbolic_groups(self):
225        re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
226        re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
227        re.compile(r'(?P<a1>x)\1(?(1)y)')
228        self.checkPatternError(r'(?P<a>)(?P<a>)',
229                               "redefinition of group name 'a' as group 2; "
230                               "was group 1")
231        self.checkPatternError(r'(?P<a>(?P=a))',
232                               "cannot refer to an open group", 10)
233        self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
234        self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
235        self.checkPatternError(r'(?P=', 'missing group name', 4)
236        self.checkPatternError(r'(?P=)', 'missing group name', 4)
237        self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
238        self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
239        self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
240        self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
241        self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
242        self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
243        self.checkPatternError(r'(?P<', 'missing group name', 4)
244        self.checkPatternError(r'(?P<>)', 'missing group name', 4)
245        self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
246        self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
247        self.checkPatternError(r'(?(', 'missing group name', 3)
248        self.checkPatternError(r'(?())', 'missing group name', 3)
249        self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
250        self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
251        self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
252        self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
253        # New valid/invalid identifiers in Python 3
254        re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
255        re.compile('(?P<��������������>x)(?P=��������������)(?(��������������)y)')
256        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
257        # Support > 100 groups.
258        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
259        pat = '(?:%s)(?(200)z|t)' % pat
260        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
261
262    def test_symbolic_refs(self):
263        self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
264                                'missing >, unterminated name', 3)
265        self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
266                                'missing group name', 3)
267        self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
268        self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
269                                "bad character in group name 'a a'", 3)
270        self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
271                                'missing group name', 3)
272        self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
273                                "bad character in group name '1a1'", 3)
274        self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
275                                'invalid group reference 2', 3)
276        self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
277                                'invalid group reference 2', 1)
278        with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
279            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
280        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
281        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
282        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
283                                "bad character in group name '-1'", 3)
284        # New valid/invalid identifiers in Python 3
285        self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
286        self.assertEqual(re.sub('(?P<��������������>x)', r'\g<��������������>', 'xx'), 'xx')
287        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
288                                "bad character in group name '©'", 3)
289        # Support > 100 groups.
290        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
291        self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
292
293    def test_re_subn(self):
294        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
295        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
296        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
297        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
298        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
299        self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
300
    def test_re_split(self):
        # Exercises re.split() over str, bytes-like and non-ASCII subjects,
        # with and without capturing groups in the separator.

        # str and str-subclass subjects; assertTypedEqual also checks that
        # the pieces have the plain str type.
        for string in ":a:b::c", S(":a:b::c"):
            self.assertTypedEqual(re.split(":", string),
                                  ['', 'a', 'b', '', 'c'])
            self.assertTypedEqual(re.split(":+", string),
                                  ['', 'a', 'b', 'c'])
            self.assertTypedEqual(re.split("(:+)", string),
                                  ['', ':', 'a', ':', 'b', '::', 'c'])
        # bytes-like subjects; the pieces are expected to be plain bytes.
        for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                       memoryview(b":a:b::c")):
            self.assertTypedEqual(re.split(b":", string),
                                  [b'', b'a', b'b', b'', b'c'])
            self.assertTypedEqual(re.split(b":+", string),
                                  [b'', b'a', b'b', b'c'])
            self.assertTypedEqual(re.split(b"(:+)", string),
                                  [b'', b':', b'a', b':', b'b', b'::', b'c'])
        # Each three-character string is unpacked into the fields a, b, c.
        for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                        "\U0001d49c\U0001d49e\U0001d4b5"):
            string = ":%s:%s::%s" % (a, b, c)
            self.assertEqual(re.split(":", string), ['', a, b, '', c])
            self.assertEqual(re.split(":+", string), ['', a, b, c])
            self.assertEqual(re.split("(:+)", string),
                             ['', ':', a, ':', b, '::', c])

        # Capturing groups in the separator add their text to the result
        # (None for a group that did not take part); non-capturing groups
        # add nothing.
        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
        self.assertEqual(re.split("(:)+", ":a:b::c"),
                         ['', ':', 'a', ':', 'b', ':', 'c'])
        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                         ['', ':', 'a', ':b::', 'c'])
        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
                          None, '::', 'c'])
        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                         ['', 'a', '', '', 'c'])

        # These separators are expected to emit a FutureWarning while still
        # producing the listed result.
        for sep, expected in [
            (':*', ['', 'a', 'b', 'c']),
            ('(?::*)', ['', 'a', 'b', 'c']),
            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
        ]:
            with self.subTest(sep=sep), self.assertWarns(FutureWarning):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

        # These separators are expected to raise ValueError.  When split()
        # raises, the comparison against `expected` is never reached.
        for sep, expected in [
            ('', [':a:b::c']),
            (r'\b', [':a:b::c']),
            (r'(?=:)', [':a:b::c']),
            (r'(?<=:)', [':a:b::c']),
        ]:
            with self.subTest(sep=sep), self.assertRaises(ValueError):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
353
354    def test_qualified_re_split(self):
355        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
356        self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
357        self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
358        self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
359                         ['', ':', 'a', ':', 'b::c'])
360        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
361                         ['', ':', 'a', ':', 'b::c'])
362        with self.assertWarns(FutureWarning):
363            self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
364                             ['', ':', 'a', ':', 'b::c'])
365
366    def test_re_findall(self):
367        self.assertEqual(re.findall(":+", "abc"), [])
368        for string in "a:b::c:::d", S("a:b::c:::d"):
369            self.assertTypedEqual(re.findall(":+", string),
370                                  [":", "::", ":::"])
371            self.assertTypedEqual(re.findall("(:+)", string),
372                                  [":", "::", ":::"])
373            self.assertTypedEqual(re.findall("(:)(:*)", string),
374                                  [(":", ""), (":", ":"), (":", "::")])
375        for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
376                       memoryview(b"a:b::c:::d")):
377            self.assertTypedEqual(re.findall(b":+", string),
378                                  [b":", b"::", b":::"])
379            self.assertTypedEqual(re.findall(b"(:+)", string),
380                                  [b":", b"::", b":::"])
381            self.assertTypedEqual(re.findall(b"(:)(:*)", string),
382                                  [(b":", b""), (b":", b":"), (b":", b"::")])
383        for x in ("\xe0", "\u0430", "\U0001d49c"):
384            xx = x * 2
385            xxx = x * 3
386            string = "a%sb%sc%sd" % (x, xx, xxx)
387            self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
388            self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
389            self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
390                             [(x, ""), (x, x), (x, xx)])
391
392    def test_bug_117612(self):
393        self.assertEqual(re.findall(r"(a|(b))", "aba"),
394                         [("a", ""),("b", "b"),("a", "")])
395
396    def test_re_match(self):
397        for string in 'a', S('a'):
398            self.assertEqual(re.match('a', string).groups(), ())
399            self.assertEqual(re.match('(a)', string).groups(), ('a',))
400            self.assertEqual(re.match('(a)', string).group(0), 'a')
401            self.assertEqual(re.match('(a)', string).group(1), 'a')
402            self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
403        for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
404            self.assertEqual(re.match(b'a', string).groups(), ())
405            self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
406            self.assertEqual(re.match(b'(a)', string).group(0), b'a')
407            self.assertEqual(re.match(b'(a)', string).group(1), b'a')
408            self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
409        for a in ("\xe0", "\u0430", "\U0001d49c"):
410            self.assertEqual(re.match(a, a).groups(), ())
411            self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
412            self.assertEqual(re.match('(%s)' % a, a).group(0), a)
413            self.assertEqual(re.match('(%s)' % a, a).group(1), a)
414            self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
415
416        pat = re.compile('((a)|(b))(c)?')
417        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
418        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
419        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
420        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
421        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
422
423        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
424        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
425        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
426                         (None, 'b', None))
427        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
428
429    def test_group(self):
430        class Index:
431            def __init__(self, value):
432                self.value = value
433            def __index__(self):
434                return self.value
435        # A single group
436        m = re.match('(a)(b)', 'ab')
437        self.assertEqual(m.group(), 'ab')
438        self.assertEqual(m.group(0), 'ab')
439        self.assertEqual(m.group(1), 'a')
440        self.assertEqual(m.group(Index(1)), 'a')
441        self.assertRaises(IndexError, m.group, -1)
442        self.assertRaises(IndexError, m.group, 3)
443        self.assertRaises(IndexError, m.group, 1<<1000)
444        self.assertRaises(IndexError, m.group, Index(1<<1000))
445        self.assertRaises(IndexError, m.group, 'x')
446        # Multiple groups
447        self.assertEqual(m.group(2, 1), ('b', 'a'))
448        self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
449
450    def test_match_getitem(self):
451        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
452
453        m = pat.match('a')
454        self.assertEqual(m['a1'], 'a')
455        self.assertEqual(m['b2'], None)
456        self.assertEqual(m['c3'], None)
457        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
458        self.assertEqual(m[0], 'a')
459        self.assertEqual(m[1], 'a')
460        self.assertEqual(m[2], None)
461        self.assertEqual(m[3], None)
462        with self.assertRaisesRegex(IndexError, 'no such group'):
463            m['X']
464        with self.assertRaisesRegex(IndexError, 'no such group'):
465            m[-1]
466        with self.assertRaisesRegex(IndexError, 'no such group'):
467            m[4]
468        with self.assertRaisesRegex(IndexError, 'no such group'):
469            m[0, 1]
470        with self.assertRaisesRegex(IndexError, 'no such group'):
471            m[(0,)]
472        with self.assertRaisesRegex(IndexError, 'no such group'):
473            m[(0, 1)]
474        with self.assertRaisesRegex(KeyError, 'a2'):
475            'a1={a2}'.format_map(m)
476
477        m = pat.match('ac')
478        self.assertEqual(m['a1'], 'a')
479        self.assertEqual(m['b2'], None)
480        self.assertEqual(m['c3'], 'c')
481        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
482        self.assertEqual(m[0], 'ac')
483        self.assertEqual(m[1], 'a')
484        self.assertEqual(m[2], None)
485        self.assertEqual(m[3], 'c')
486
487        # Cannot assign.
488        with self.assertRaises(TypeError):
489            m[0] = 1
490
491        # No len().
492        self.assertRaises(TypeError, len, m)
493
494    def test_re_fullmatch(self):
495        # Issue 16203: Proposal: add re.fullmatch() method.
496        self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
497        for string in "ab", S("ab"):
498            self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
499        for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
500            self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
501        for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
502            r = r"%s|%s" % (a, a + b)
503            self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
504        self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
505        self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
506        self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
507        self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
508        self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
509        self.assertIsNone(re.fullmatch(r"a+", "ab"))
510        self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
511        self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
512        self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
513        self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
514        self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
515        self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
516
517        self.assertEqual(
518            re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
519        self.assertEqual(
520            re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
521        self.assertEqual(
522            re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
523
524    def test_re_groupref_exists(self):
525        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
526                         ('(', 'a'))
527        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
528                         (None, 'a'))
529        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
530        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
531        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
532                         ('a', 'b'))
533        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
534                         (None, 'd'))
535        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
536                         (None, 'd'))
537        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
538                         ('a', ''))
539
540        # Tests for bug #1177831: exercise groups other than the first group
541        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
542        self.assertEqual(p.match('abc').groups(),
543                         ('a', 'b', 'c'))
544        self.assertEqual(p.match('ad').groups(),
545                         ('a', None, 'd'))
546        self.assertIsNone(p.match('abd'))
547        self.assertIsNone(p.match('ac'))
548
549        # Support > 100 groups.
550        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
551        pat = '(?:%s)(?(200)z)' % pat
552        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
553
554        self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
555        self.checkPatternError(r'()(?(1)a|b',
556                               'missing ), unterminated subpattern', 2)
557        self.checkPatternError(r'()(?(1)a|b|c)',
558                               'conditional backref with more than '
559                               'two branches', 10)
560
561    def test_re_groupref_overflow(self):
562        from sre_constants import MAXGROUPS
563        self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
564                                'invalid group reference %d' % MAXGROUPS, 3)
565        self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
566                               'invalid group reference %d' % MAXGROUPS, 10)
567
568    def test_re_groupref(self):
569        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
570                         ('|', 'a'))
571        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
572                         (None, 'a'))
573        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
574        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
575        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
576                         ('a', 'a'))
577        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
578                         (None, None))
579
580        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
581
582    def test_groupdict(self):
583        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
584                                  'first second').groupdict(),
585                         {'first':'first', 'second':'second'})
586
587    def test_expand(self):
588        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
589                                  "first second")
590                                  .expand(r"\2 \1 \g<second> \g<first>"),
591                         "second first second first")
592        self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
593                                  "first")
594                                  .expand(r"\2 \g<second>"),
595                         " ")
596
597    def test_repeat_minmax(self):
598        self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
599        self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
600        self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
601        self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))
602
603        self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
604        self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
605        self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
606        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
607        self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
608        self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
609        self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
610        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
611
612        self.assertIsNone(re.match(r"^x{1}$", "xxx"))
613        self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
614        self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
615        self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))
616
617        self.assertTrue(re.match(r"^x{3}$", "xxx"))
618        self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
619        self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
620        self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
621        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
622        self.assertTrue(re.match(r"^x{3}?$", "xxx"))
623        self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
624        self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
625        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
626
627        self.assertIsNone(re.match(r"^x{}$", "xxx"))
628        self.assertTrue(re.match(r"^x{}$", "x{}"))
629
630        self.checkPatternError(r'x{2,1}',
631                               'min repeat greater than max repeat', 2)
632
633    def test_getattr(self):
634        self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
635        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
636        self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
637        self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
638        self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
639                         {'first': 1, 'other': 2})
640
641        self.assertEqual(re.match("(a)", "a").pos, 0)
642        self.assertEqual(re.match("(a)", "a").endpos, 1)
643        self.assertEqual(re.match("(a)", "a").string, "a")
644        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
645        self.assertTrue(re.match("(a)", "a").re)
646
647        # Issue 14260. groupindex should be non-modifiable mapping.
648        p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
649        self.assertEqual(sorted(p.groupindex), ['first', 'other'])
650        self.assertEqual(p.groupindex['other'], 2)
651        with self.assertRaises(TypeError):
652            p.groupindex['other'] = 0
653        self.assertEqual(p.groupindex['other'], 2)
654
655    def test_special_escapes(self):
656        self.assertEqual(re.search(r"\b(b.)\b",
657                                   "abcd abc bcd bx").group(1), "bx")
658        self.assertEqual(re.search(r"\B(b.)\B",
659                                   "abc bcd bc abxd").group(1), "bx")
660        self.assertEqual(re.search(r"\b(b.)\b",
661                                   "abcd abc bcd bx", re.ASCII).group(1), "bx")
662        self.assertEqual(re.search(r"\B(b.)\B",
663                                   "abc bcd bc abxd", re.ASCII).group(1), "bx")
664        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
665        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
666        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
667        self.assertEqual(re.search(br"\b(b.)\b",
668                                   b"abcd abc bcd bx").group(1), b"bx")
669        self.assertEqual(re.search(br"\B(b.)\B",
670                                   b"abc bcd bc abxd").group(1), b"bx")
671        self.assertEqual(re.search(br"\b(b.)\b",
672                                   b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
673        self.assertEqual(re.search(br"\B(b.)\B",
674                                   b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
675        self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
676        self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
677        self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
678        self.assertEqual(re.search(r"\d\D\w\W\s\S",
679                                   "1aa! a").group(0), "1aa! a")
680        self.assertEqual(re.search(br"\d\D\w\W\s\S",
681                                   b"1aa! a").group(0), b"1aa! a")
682        self.assertEqual(re.search(r"\d\D\w\W\s\S",
683                                   "1aa! a", re.ASCII).group(0), "1aa! a")
684        self.assertEqual(re.search(br"\d\D\w\W\s\S",
685                                   b"1aa! a", re.LOCALE).group(0), b"1aa! a")
686
687    def test_other_escapes(self):
688        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
689        self.assertEqual(re.match(r"\(", '(').group(), '(')
690        self.assertIsNone(re.match(r"\(", ')'))
691        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
692        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
693        self.assertIsNone(re.match(r"[\]]", '['))
694        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
695        self.assertIsNone(re.match(r"[a\-c]", 'b'))
696        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
697        self.assertIsNone(re.match(r"[\^a]+", 'b'))
698        re.purge()  # for warnings
699        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
700            with self.subTest(c):
701                self.assertRaises(re.error, re.compile, '\\%c' % c)
702        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
703            with self.subTest(c):
704                self.assertRaises(re.error, re.compile, '[\\%c]' % c)
705
706    def test_string_boundaries(self):
707        # See http://bugs.python.org/issue10713
708        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
709                         "abc")
710        # There's a word boundary at the start of a string.
711        self.assertTrue(re.match(r"\b", "abc"))
712        # A non-empty string includes a non-boundary zero-length match.
713        self.assertTrue(re.search(r"\B", "abc"))
714        # There is no non-boundary match at the start of a string.
715        self.assertFalse(re.match(r"\B", "abc"))
716        # However, an empty string contains no word boundaries, and also no
717        # non-boundaries.
718        self.assertIsNone(re.search(r"\B", ""))
719        # This one is questionable and different from the perlre behaviour,
720        # but describes current behavior.
721        self.assertIsNone(re.search(r"\b", ""))
722        # A single word-character string has two boundaries, but no
723        # non-boundary gaps.
724        self.assertEqual(len(re.findall(r"\b", "a")), 2)
725        self.assertEqual(len(re.findall(r"\B", "a")), 0)
726        # If there are no words, there are no boundaries
727        self.assertEqual(len(re.findall(r"\b", " ")), 0)
728        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
729        # Can match around the whitespace.
730        self.assertEqual(len(re.findall(r"\B", " ")), 2)
731
732    def test_bigcharset(self):
733        self.assertEqual(re.match("([\u2222\u2223])",
734                                  "\u2222").group(1), "\u2222")
735        r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
736        self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
737
738    def test_big_codesize(self):
739        # Issue #1160
740        r = re.compile('|'.join(('%d'%x for x in range(10000))))
741        self.assertTrue(r.match('1000'))
742        self.assertTrue(r.match('9999'))
743
744    def test_anyall(self):
745        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
746                         "a\nb")
747        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
748                         "a\n\nb")
749
750    def test_lookahead(self):
751        self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
752        self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
753        self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
754        self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
755        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
756        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
757        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
758
759        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
760        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
761        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
762        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
763
764        # Group reference.
765        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
766        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
767        # Conditional group reference.
768        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
769        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
770        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
771        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
772        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
773        # Group used before defined.
774        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
775        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
776        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
777
778    def test_lookbehind(self):
779        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
780        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
781        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
782        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
783        # Group reference.
784        self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
785        self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
786        self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
787        self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
788        # Conditional group reference.
789        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
790        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
791        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
792        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
793        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
794        # Group used before defined.
795        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
796        self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
797        self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
798        # Group defined in the same lookbehind pattern
799        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
800        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
801        self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
802        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
803
804    def test_ignore_case(self):
805        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
806        self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
807        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
808        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
809        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
810        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
811        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
812        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
813        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
814        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
815
816        assert '\u212a'.lower() == 'k' # 'K'
817        self.assertTrue(re.match(r'K', '\u212a', re.I))
818        self.assertTrue(re.match(r'k', '\u212a', re.I))
819        self.assertTrue(re.match(r'\u212a', 'K', re.I))
820        self.assertTrue(re.match(r'\u212a', 'k', re.I))
821        assert '\u017f'.upper() == 'S' # 'ſ'
822        self.assertTrue(re.match(r'S', '\u017f', re.I))
823        self.assertTrue(re.match(r's', '\u017f', re.I))
824        self.assertTrue(re.match(r'\u017f', 'S', re.I))
825        self.assertTrue(re.match(r'\u017f', 's', re.I))
826        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
827        self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
828        self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
829
830    def test_ignore_case_set(self):
831        self.assertTrue(re.match(r'[19A]', 'A', re.I))
832        self.assertTrue(re.match(r'[19a]', 'a', re.I))
833        self.assertTrue(re.match(r'[19a]', 'A', re.I))
834        self.assertTrue(re.match(r'[19A]', 'a', re.I))
835        self.assertTrue(re.match(br'[19A]', b'A', re.I))
836        self.assertTrue(re.match(br'[19a]', b'a', re.I))
837        self.assertTrue(re.match(br'[19a]', b'A', re.I))
838        self.assertTrue(re.match(br'[19A]', b'a', re.I))
839        assert '\u212a'.lower() == 'k' # 'K'
840        self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
841        self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
842        self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
843        self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
844        assert '\u017f'.upper() == 'S' # 'ſ'
845        self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
846        self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
847        self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
848        self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
849        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
850        self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
851        self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
852
853    def test_ignore_case_range(self):
854        # Issues #3511, #17381.
855        self.assertTrue(re.match(r'[9-a]', '_', re.I))
856        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
857        self.assertTrue(re.match(br'[9-a]', b'_', re.I))
858        self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
859        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
860        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
861        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
862        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
863        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
864        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
865        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
866        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
867        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
868        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
869        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
870        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
871
872        assert '\u212a'.lower() == 'k' # 'K'
873        self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
874        self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
875        self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
876        self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
877        assert '\u017f'.upper() == 'S' # 'ſ'
878        self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
879        self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
880        self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
881        self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
882        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
883        self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
884        self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
885
886    def test_category(self):
887        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
888
889    def test_getlower(self):
890        import _sre
891        self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
892        self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
893        self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
894        self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
895
896        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
897        self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
898        self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
899        self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
900
901    def test_not_literal(self):
902        self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
903        self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
904
905    def test_search_coverage(self):
906        self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
907        self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
908
909    def assertMatch(self, pattern, text, match=None, span=None,
910                    matcher=re.match):
911        if match is None and span is None:
912            # the pattern matches the whole text
913            match = text
914            span = (0, len(text))
915        elif match is None or span is None:
916            raise ValueError('If match is not None, span should be specified '
917                             '(and vice versa).')
918        m = matcher(pattern, text)
919        self.assertTrue(m)
920        self.assertEqual(m.group(), match)
921        self.assertEqual(m.span(), span)
922
923    def test_re_escape(self):
924        alnum_chars = string.ascii_letters + string.digits + '_'
925        p = ''.join(chr(i) for i in range(256))
926        for c in p:
927            if c in alnum_chars:
928                self.assertEqual(re.escape(c), c)
929            elif c == '\x00':
930                self.assertEqual(re.escape(c), '\\000')
931            else:
932                self.assertEqual(re.escape(c), '\\' + c)
933            self.assertMatch(re.escape(c), c)
934        self.assertMatch(re.escape(p), p)
935
936    def test_re_escape_byte(self):
937        alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
938        p = bytes(range(256))
939        for i in p:
940            b = bytes([i])
941            if b in alnum_chars:
942                self.assertEqual(re.escape(b), b)
943            elif i == 0:
944                self.assertEqual(re.escape(b), b'\\000')
945            else:
946                self.assertEqual(re.escape(b), b'\\' + b)
947            self.assertMatch(re.escape(b), b)
948        self.assertMatch(re.escape(p), p)
949
950    def test_re_escape_non_ascii(self):
951        s = 'xxx\u2620\u2620\u2620xxx'
952        s_escaped = re.escape(s)
953        self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
954        self.assertMatch(s_escaped, s)
955        self.assertMatch('.%s+.' % re.escape('\u2620'), s,
956                         'x\u2620\u2620\u2620x', (2, 7), re.search)
957
958    def test_re_escape_non_ascii_bytes(self):
959        b = 'y\u2620y\u2620y'.encode('utf-8')
960        b_escaped = re.escape(b)
961        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
962        self.assertMatch(b_escaped, b)
963        res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
964        self.assertEqual(len(res), 2)
965
966    def test_pickling(self):
967        import pickle
968        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
969        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
970            pickled = pickle.dumps(oldpat, proto)
971            newpat = pickle.loads(pickled)
972            self.assertEqual(newpat, oldpat)
973        # current pickle expects the _compile() reconstructor in re module
974        from re import _compile
975
976    def test_constants(self):
977        self.assertEqual(re.I, re.IGNORECASE)
978        self.assertEqual(re.L, re.LOCALE)
979        self.assertEqual(re.M, re.MULTILINE)
980        self.assertEqual(re.S, re.DOTALL)
981        self.assertEqual(re.X, re.VERBOSE)
982
983    def test_flags(self):
984        for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
985            self.assertTrue(re.compile('^pattern$', flag))
986        for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
987            self.assertTrue(re.compile(b'^pattern$', flag))
988
989    def test_sre_character_literals(self):
990        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
991            if i < 256:
992                self.assertTrue(re.match(r"\%03o" % i, chr(i)))
993                self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
994                self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
995                self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
996                self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
997                self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
998            if i < 0x10000:
999                self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
1000                self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
1001                self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
1002            self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
1003            self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
1004            self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
1005        self.assertTrue(re.match(r"\0", "\000"))
1006        self.assertTrue(re.match(r"\08", "\0008"))
1007        self.assertTrue(re.match(r"\01", "\001"))
1008        self.assertTrue(re.match(r"\018", "\0018"))
1009        self.checkPatternError(r"\567",
1010                               r'octal escape value \567 outside of '
1011                               r'range 0-0o377', 0)
1012        self.checkPatternError(r"\911", 'invalid group reference 91', 1)
1013        self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
1014        self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
1015        self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
1016        self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
1017        self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
1018        self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
1019        self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
1020
1021    def test_sre_character_class_literals(self):
1022        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1023            if i < 256:
1024                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
1025                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
1026                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
1027                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
1028                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
1029                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
1030                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
1031                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
1032            if i < 0x10000:
1033                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
1034                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
1035                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
1036            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
1037            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
1038            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
1039        self.checkPatternError(r"[\567]",
1040                               r'octal escape value \567 outside of '
1041                               r'range 0-0o377', 1)
1042        self.checkPatternError(r"[\911]", r'bad escape \9', 1)
1043        self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
1044        self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
1045        self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
1046        self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
1047        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
1048
1049    def test_sre_byte_literals(self):
1050        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1051            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
1052            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
1053            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
1054            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
1055            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
1056            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
1057        self.assertRaises(re.error, re.compile, br"\u1234")
1058        self.assertRaises(re.error, re.compile, br"\U00012345")
1059        self.assertTrue(re.match(br"\0", b"\000"))
1060        self.assertTrue(re.match(br"\08", b"\0008"))
1061        self.assertTrue(re.match(br"\01", b"\001"))
1062        self.assertTrue(re.match(br"\018", b"\0018"))
1063        self.checkPatternError(br"\567",
1064                               r'octal escape value \567 outside of '
1065                               r'range 0-0o377', 0)
1066        self.checkPatternError(br"\911", 'invalid group reference 91', 1)
1067        self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1068        self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
1069
1070    def test_sre_byte_class_literals(self):
1071        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1072            self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1073            self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1074            self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1075            self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1076            self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1077            self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1078            self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1079            self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
1080        self.assertRaises(re.error, re.compile, br"[\u1234]")
1081        self.assertRaises(re.error, re.compile, br"[\U00012345]")
1082        self.checkPatternError(br"[\567]",
1083                               r'octal escape value \567 outside of '
1084                               r'range 0-0o377', 1)
1085        self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1086        self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1087
1088    def test_character_set_errors(self):
1089        self.checkPatternError(r'[', 'unterminated character set', 0)
1090        self.checkPatternError(r'[^', 'unterminated character set', 0)
1091        self.checkPatternError(r'[a', 'unterminated character set', 0)
1092        # bug 545855 -- This pattern failed to cause a compile error as it
1093        # should, instead provoking a TypeError.
1094        self.checkPatternError(r"[a-", 'unterminated character set', 0)
1095        self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1096        self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1097        self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
1098
1099    def test_bug_113254(self):
1100        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1101        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1102        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1103
1104    def test_bug_527371(self):
1105        # bug described in patches 527371/672491
1106        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
1107        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1108        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1109        self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1110        self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
1111
1112    def test_bug_418626(self):
1113        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1114        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1115        # pattern '*?' on a long string.
1116        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1117        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1118                         20003)
1119        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
1120        # non-simple '*?' still used to hit the recursion limit, before the
1121        # non-recursive scheme was implemented.
1122        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
1123
1124    def test_bug_612074(self):
1125        pat="["+re.escape("\u2039")+"]"
1126        self.assertEqual(re.compile(pat) and 1, 1)
1127
1128    def test_stack_overflow(self):
1129        # nasty cases that used to overflow the straightforward recursive
1130        # implementation of repeated groups.
1131        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1132        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1133        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
1134
1135    def test_nothing_to_repeat(self):
1136        for reps in '*', '+', '?', '{1,2}':
1137            for mod in '', '?':
1138                self.checkPatternError('%s%s' % (reps, mod),
1139                                       'nothing to repeat', 0)
1140                self.checkPatternError('(?:%s%s)' % (reps, mod),
1141                                       'nothing to repeat', 3)
1142
1143    def test_multiple_repeat(self):
1144        for outer_reps in '*', '+', '{1,2}':
1145            for outer_mod in '', '?':
1146                outer_op = outer_reps + outer_mod
1147                for inner_reps in '*', '+', '?', '{1,2}':
1148                    for inner_mod in '', '?':
1149                        inner_op = inner_reps + inner_mod
1150                        self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1151                                'multiple repeat', 1 + len(inner_op))
1152
1153    def test_unlimited_zero_width_repeat(self):
1154        # Issue #9669
1155        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1156        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1157        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1158        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1159        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1160        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1161
1162    def test_scanner(self):
1163        def s_ident(scanner, token): return token
1164        def s_operator(scanner, token): return "op%s" % token
1165        def s_float(scanner, token): return float(token)
1166        def s_int(scanner, token): return int(token)
1167
1168        scanner = Scanner([
1169            (r"[a-zA-Z_]\w*", s_ident),
1170            (r"\d+\.\d*", s_float),
1171            (r"\d+", s_int),
1172            (r"=|\+|-|\*|/", s_operator),
1173            (r"\s+", None),
1174            ])
1175
1176        self.assertTrue(scanner.scanner.scanner("").pattern)
1177
1178        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1179                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1180                           'op+', 'bar'], ''))
1181
1182    def test_bug_448951(self):
1183        # bug 448951 (similar to 429357, but with single char match)
1184        # (Also test greedy matches.)
1185        for op in '','?','*':
1186            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1187                             (None, None))
1188            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1189                             ('a:', 'a'))
1190
1191    def test_bug_725106(self):
1192        # capturing groups in alternatives in repeats
1193        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1194                         ('b', 'a'))
1195        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1196                         ('c', 'b'))
1197        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1198                         ('b', None))
1199        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1200                         ('b', None))
1201        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1202                         ('b', 'a'))
1203        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1204                         ('c', 'b'))
1205        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1206                         ('b', None))
1207        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1208                         ('b', None))
1209
1210    def test_bug_725149(self):
1211        # mark_stack_base restoring before restoring marks
1212        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1213                         ('a', None))
1214        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1215                         ('a', None, None))
1216
1217    def test_bug_764548(self):
1218        # bug 764548, re.compile() barfs on str/unicode subclasses
1219        class my_unicode(str): pass
1220        pat = re.compile(my_unicode("abc"))
1221        self.assertIsNone(pat.match("xyz"))
1222
1223    def test_finditer(self):
1224        iter = re.finditer(r":+", "a:b::c:::d")
1225        self.assertEqual([item.group(0) for item in iter],
1226                         [":", "::", ":::"])
1227
1228        pat = re.compile(r":+")
1229        iter = pat.finditer("a:b::c:::d", 1, 10)
1230        self.assertEqual([item.group(0) for item in iter],
1231                         [":", "::", ":::"])
1232
1233        pat = re.compile(r":+")
1234        iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1235        self.assertEqual([item.group(0) for item in iter],
1236                         [":", "::", ":::"])
1237
1238        pat = re.compile(r":+")
1239        iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1240        self.assertEqual([item.group(0) for item in iter],
1241                         [":", "::", ":::"])
1242
1243        pat = re.compile(r":+")
1244        iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1245        self.assertEqual([item.group(0) for item in iter],
1246                         ["::", "::"])
1247
1248    def test_bug_926075(self):
1249        self.assertIsNot(re.compile('bug_926075'),
1250                         re.compile(b'bug_926075'))
1251
1252    def test_bug_931848(self):
1253        pattern = "[\u002E\u3002\uFF0E\uFF61]"
1254        self.assertEqual(re.compile(pattern).split("a.b.c"),
1255                         ['a','b','c'])
1256
1257    def test_bug_581080(self):
1258        iter = re.finditer(r"\s", "a b")
1259        self.assertEqual(next(iter).span(), (1,2))
1260        self.assertRaises(StopIteration, next, iter)
1261
1262        scanner = re.compile(r"\s").scanner("a b")
1263        self.assertEqual(scanner.search().span(), (1, 2))
1264        self.assertIsNone(scanner.search())
1265
1266    def test_bug_817234(self):
1267        iter = re.finditer(r".*", "asdf")
1268        self.assertEqual(next(iter).span(), (0, 4))
1269        self.assertEqual(next(iter).span(), (4, 4))
1270        self.assertRaises(StopIteration, next, iter)
1271
1272    def test_bug_6561(self):
1273        # '\d' should match characters in Unicode category 'Nd'
1274        # (Number, Decimal Digit), but not those in 'Nl' (Number,
1275        # Letter) or 'No' (Number, Other).
1276        decimal_digits = [
1277            '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1278            '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1279            '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1280            ]
1281        for x in decimal_digits:
1282            self.assertEqual(re.match(r'^\d$', x).group(0), x)
1283
1284        not_decimal_digits = [
1285            '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1286            '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1287            '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1288            '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1289            ]
1290        for x in not_decimal_digits:
1291            self.assertIsNone(re.match(r'^\d$', x))
1292
1293    def test_empty_array(self):
1294        # SF buf 1647541
1295        import array
1296        for typecode in 'bBuhHiIlLfd':
1297            a = array.array(typecode)
1298            self.assertIsNone(re.compile(b"bla").match(a))
1299            self.assertEqual(re.compile(b"").match(a).groups(), ())
1300
1301    def test_inline_flags(self):
1302        # Bug #1700
1303        upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1304        lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
1305
1306        p = re.compile(upper_char, re.I | re.U)
1307        q = p.match(lower_char)
1308        self.assertTrue(q)
1309
1310        p = re.compile(lower_char, re.I | re.U)
1311        q = p.match(upper_char)
1312        self.assertTrue(q)
1313
1314        p = re.compile('(?i)' + upper_char, re.U)
1315        q = p.match(lower_char)
1316        self.assertTrue(q)
1317
1318        p = re.compile('(?i)' + lower_char, re.U)
1319        q = p.match(upper_char)
1320        self.assertTrue(q)
1321
1322        p = re.compile('(?iu)' + upper_char)
1323        q = p.match(lower_char)
1324        self.assertTrue(q)
1325
1326        p = re.compile('(?iu)' + lower_char)
1327        q = p.match(upper_char)
1328        self.assertTrue(q)
1329
1330        self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
1331        self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
1332
1333        p = upper_char + '(?i)'
1334        with self.assertWarns(DeprecationWarning) as warns:
1335            self.assertTrue(re.match(p, lower_char))
1336        self.assertEqual(
1337            str(warns.warnings[0].message),
1338            'Flags not at the start of the expression %s' % p
1339        )
1340
1341        p = upper_char + '(?i)%s' % ('.?' * 100)
1342        with self.assertWarns(DeprecationWarning) as warns:
1343            self.assertTrue(re.match(p, lower_char))
1344        self.assertEqual(
1345            str(warns.warnings[0].message),
1346            'Flags not at the start of the expression %s (truncated)' % p[:20]
1347        )
1348
1349    def test_dollar_matches_twice(self):
1350        "$ matches the end of string, and just before the terminating \n"
1351        pattern = re.compile('$')
1352        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1353        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1354        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1355
1356        pattern = re.compile('$', re.MULTILINE)
1357        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1358        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1359        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1360
1361    def test_bytes_str_mixing(self):
1362        # Mixing str and bytes is disallowed
1363        pat = re.compile('.')
1364        bpat = re.compile(b'.')
1365        self.assertRaises(TypeError, pat.match, b'b')
1366        self.assertRaises(TypeError, bpat.match, 'b')
1367        self.assertRaises(TypeError, pat.sub, b'b', 'c')
1368        self.assertRaises(TypeError, pat.sub, 'b', b'c')
1369        self.assertRaises(TypeError, pat.sub, b'b', b'c')
1370        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1371        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1372        self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1373
1374    def test_ascii_and_unicode_flag(self):
1375        # String patterns
1376        for flags in (0, re.UNICODE):
1377            pat = re.compile('\xc0', flags | re.IGNORECASE)
1378            self.assertTrue(pat.match('\xe0'))
1379            pat = re.compile(r'\w', flags)
1380            self.assertTrue(pat.match('\xe0'))
1381        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
1382        self.assertIsNone(pat.match('\xe0'))
1383        pat = re.compile('(?a)\xc0', re.IGNORECASE)
1384        self.assertIsNone(pat.match('\xe0'))
1385        pat = re.compile(r'\w', re.ASCII)
1386        self.assertIsNone(pat.match('\xe0'))
1387        pat = re.compile(r'(?a)\w')
1388        self.assertIsNone(pat.match('\xe0'))
1389        # Bytes patterns
1390        for flags in (0, re.ASCII):
1391            pat = re.compile(b'\xc0', flags | re.IGNORECASE)
1392            self.assertIsNone(pat.match(b'\xe0'))
1393            pat = re.compile(br'\w', flags)
1394            self.assertIsNone(pat.match(b'\xe0'))
1395        # Incompatibilities
1396        self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
1397        self.assertRaises(ValueError, re.compile, br'(?u)\w')
1398        self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1399        self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1400        self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
1401        self.assertRaises(ValueError, re.compile, r'(?au)\w')
1402
1403    def test_locale_flag(self):
1404        import locale
1405        enc = locale.getpreferredencoding(False)
1406        # Search non-ASCII letter
1407        for i in range(128, 256):
1408            try:
1409                c = bytes([i]).decode(enc)
1410                sletter = c.lower()
1411                if sletter == c: continue
1412                bletter = sletter.encode(enc)
1413                if len(bletter) != 1: continue
1414                if bletter.decode(enc) != sletter: continue
1415                bpat = re.escape(bytes([i]))
1416                break
1417            except (UnicodeError, TypeError):
1418                pass
1419        else:
1420            bletter = None
1421            bpat = b'A'
1422        # Bytes patterns
1423        pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1424        if bletter:
1425            self.assertTrue(pat.match(bletter))
1426        pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1427        if bletter:
1428            self.assertTrue(pat.match(bletter))
1429        pat = re.compile(bpat, re.IGNORECASE)
1430        if bletter:
1431            self.assertIsNone(pat.match(bletter))
1432        pat = re.compile(br'\w', re.LOCALE)
1433        if bletter:
1434            self.assertTrue(pat.match(bletter))
1435        pat = re.compile(br'(?L)\w')
1436        if bletter:
1437            self.assertTrue(pat.match(bletter))
1438        pat = re.compile(br'\w')
1439        if bletter:
1440            self.assertIsNone(pat.match(bletter))
1441        # Incompatibilities
1442        self.assertRaises(ValueError, re.compile, '', re.LOCALE)
1443        self.assertRaises(ValueError, re.compile, '(?L)')
1444        self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1445        self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1446        self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
1447        self.assertRaises(ValueError, re.compile, b'(?aL)')
1448
1449    def test_scoped_flags(self):
1450        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
1451        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
1452        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
1453        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
1454        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
1455        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
1456
1457        self.assertTrue(re.match(r'(?x: a) b', 'a b'))
1458        self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
1459        self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
1460        self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
1461
1462        self.checkPatternError(r'(?a:\w)',
1463                               'bad inline flags: cannot turn on global flag', 3)
1464        self.checkPatternError(r'(?a)(?-a:\w)',
1465                               'bad inline flags: cannot turn off global flag', 8)
1466        self.checkPatternError(r'(?i-i:a)',
1467                               'bad inline flags: flag turned on and off', 5)
1468
1469        self.checkPatternError(r'(?-', 'missing flag', 3)
1470        self.checkPatternError(r'(?-+', 'missing flag', 3)
1471        self.checkPatternError(r'(?-z', 'unknown flag', 3)
1472        self.checkPatternError(r'(?-i', 'missing :', 4)
1473        self.checkPatternError(r'(?-i)', 'missing :', 4)
1474        self.checkPatternError(r'(?-i+', 'missing :', 4)
1475        self.checkPatternError(r'(?-iz', 'unknown flag', 4)
1476        self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
1477        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
1478        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
1479        self.checkPatternError(r'(?iz', 'unknown flag', 3)
1480
1481    def test_bug_6509(self):
1482        # Replacement strings of both types must parse properly.
1483        # all strings
1484        pat = re.compile(r'a(\w)')
1485        self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1486        pat = re.compile('a(.)')
1487        self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1488        pat = re.compile('..')
1489        self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1490
1491        # all bytes
1492        pat = re.compile(br'a(\w)')
1493        self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1494        pat = re.compile(b'a(.)')
1495        self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1496        pat = re.compile(b'..')
1497        self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1498
1499    def test_dealloc(self):
1500        # issue 3299: check for segfault in debug build
1501        import _sre
1502        # the overflow limit is different on wide and narrow builds and it
1503        # depends on the definition of SRE_CODE (see sre.h).
1504        # 2**128 should be big enough to overflow on both. For smaller values
1505        # a RuntimeError is raised instead of OverflowError.
1506        long_overflow = 2**128
1507        self.assertRaises(TypeError, re.finditer, "a", {})
1508        with self.assertRaises(OverflowError):
1509            _sre.compile("abc", 0, [long_overflow], 0, [], [])
1510        with self.assertRaises(TypeError):
1511            _sre.compile({}, 0, [], 0, [], [])
1512
1513    def test_search_dot_unicode(self):
1514        self.assertTrue(re.search("123.*-", '123abc-'))
1515        self.assertTrue(re.search("123.*-", '123\xe9-'))
1516        self.assertTrue(re.search("123.*-", '123\u20ac-'))
1517        self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1518        self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1519
1520    def test_compile(self):
1521        # Test return value when given string and pattern as parameter
1522        pattern = re.compile('random pattern')
1523        self.assertIsInstance(pattern, re._pattern_type)
1524        same_pattern = re.compile(pattern)
1525        self.assertIsInstance(same_pattern, re._pattern_type)
1526        self.assertIs(same_pattern, pattern)
1527        # Test behaviour when not given a string or pattern as parameter
1528        self.assertRaises(TypeError, re.compile, 0)
1529
1530    @bigmemtest(size=_2G, memuse=1)
1531    def test_large_search(self, size):
1532        # Issue #10182: indices were 32-bit-truncated.
1533        s = 'a' * size
1534        m = re.search('$', s)
1535        self.assertIsNotNone(m)
1536        self.assertEqual(m.start(), size)
1537        self.assertEqual(m.end(), size)
1538
1539    # The huge memuse is because of re.sub() using a list and a join()
1540    # to create the replacement result.
1541    @bigmemtest(size=_2G, memuse=16 + 2)
1542    def test_large_subn(self, size):
1543        # Issue #10182: indices were 32-bit-truncated.
1544        s = 'a' * size
1545        r, n = re.subn('', '', s)
1546        self.assertEqual(r, s)
1547        self.assertEqual(n, size + 1)
1548
1549    def test_bug_16688(self):
1550        # Issue 16688: Backreferences make case-insensitive regex fail on
1551        # non-ASCII strings.
1552        self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1553        self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
1554
1555    def test_repeat_minmax_overflow(self):
1556        # Issue #13169
1557        string = "x" * 100000
1558        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1559        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1560        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1561        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1562        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1563        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1564        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1565        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1566        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1567        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1568        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1569
1570    @cpython_only
1571    def test_repeat_minmax_overflow_maxrepeat(self):
1572        try:
1573            from _sre import MAXREPEAT
1574        except ImportError:
1575            self.skipTest('requires _sre.MAXREPEAT constant')
1576        string = "x" * 100000
1577        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1578        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1579                         (0, 100000))
1580        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1581        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1582        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1583        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1584
1585    def test_backref_group_name_in_exception(self):
1586        # Issue 17341: Poor error message when compiling invalid regex
1587        self.checkPatternError('(?P=<foo>)',
1588                               "bad character in group name '<foo>'", 4)
1589
1590    def test_group_name_in_exception(self):
1591        # Issue 17341: Poor error message when compiling invalid regex
1592        self.checkPatternError('(?P<?foo>)',
1593                               "bad character in group name '?foo'", 4)
1594
1595    def test_issue17998(self):
1596        for reps in '*', '+', '?', '{1}':
1597            for mod in '', '?':
1598                pattern = '.' + reps + mod + 'yz'
1599                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1600                                 ['xyz'], msg=pattern)
1601                pattern = pattern.encode()
1602                self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1603                                 [b'xyz'], msg=pattern)
1604
1605    def test_match_repr(self):
1606        for string in '[abracadabra]', S('[abracadabra]'):
1607            m = re.search(r'(.+)(.*?)\1', string)
1608            self.assertEqual(repr(m), "<%s.%s object; "
1609                             "span=(1, 12), match='abracadabra'>" %
1610                             (type(m).__module__, type(m).__qualname__))
1611        for string in (b'[abracadabra]', B(b'[abracadabra]'),
1612                       bytearray(b'[abracadabra]'),
1613                       memoryview(b'[abracadabra]')):
1614            m = re.search(br'(.+)(.*?)\1', string)
1615            self.assertEqual(repr(m), "<%s.%s object; "
1616                             "span=(1, 12), match=b'abracadabra'>" %
1617                             (type(m).__module__, type(m).__qualname__))
1618
1619        first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1620        self.assertEqual(repr(first), "<%s.%s object; "
1621                         "span=(0, 2), match='aa'>" %
1622                         (type(second).__module__, type(first).__qualname__))
1623        self.assertEqual(repr(second), "<%s.%s object; "
1624                         "span=(3, 5), match='bb'>" %
1625                         (type(second).__module__, type(second).__qualname__))
1626
1627
1628    def test_bug_2537(self):
1629        # issue 2537: empty submatches
1630        for outer_op in ('{0,}', '*', '+', '{1,187}'):
1631            for inner_op in ('{0,}', '*', '?'):
1632                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1633                m = r.match("xyyzy")
1634                self.assertEqual(m.group(0), "xyy")
1635                self.assertEqual(m.group(1), "")
1636                self.assertEqual(m.group(2), "y")
1637
1638    def test_debug_flag(self):
1639        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
1640        with captured_stdout() as out:
1641            re.compile(pat, re.DEBUG)
1642        dump = '''\
1643SUBPATTERN 1 0 0
1644  LITERAL 46
1645SUBPATTERN None 0 0
1646  BRANCH
1647    IN
1648      LITERAL 99
1649      LITERAL 104
1650  OR
1651    LITERAL 112
1652    LITERAL 121
1653SUBPATTERN None 0 0
1654  GROUPREF_EXISTS 1
1655    AT AT_END
1656  ELSE
1657    LITERAL 58
1658    LITERAL 32
1659'''
1660        self.assertEqual(out.getvalue(), dump)
1661        # Debug output is output again even a second time (bypassing
1662        # the cache -- issue #20426).
1663        with captured_stdout() as out:
1664            re.compile(pat, re.DEBUG)
1665        self.assertEqual(out.getvalue(), dump)
1666
1667    def test_keyword_parameters(self):
1668        # Issue #20283: Accepting the string keyword parameter.
1669        pat = re.compile(r'(ab)')
1670        self.assertEqual(
1671            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1672        self.assertEqual(
1673            pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1674        self.assertEqual(
1675            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1676        self.assertEqual(
1677            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1678        self.assertEqual(
1679            pat.split(string='abracadabra', maxsplit=1),
1680            ['', 'ab', 'racadabra'])
1681        self.assertEqual(
1682            pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1683            (7, 9))
1684
1685    def test_bug_20998(self):
1686        # Issue #20998: Fullmatch of repeated single character pattern
1687        # with ignore case.
1688        self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1689
1690    def test_locale_caching(self):
1691        # Issue #22410
1692        oldlocale = locale.setlocale(locale.LC_CTYPE)
1693        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1694        for loc in 'en_US.iso88591', 'en_US.utf8':
1695            try:
1696                locale.setlocale(locale.LC_CTYPE, loc)
1697            except locale.Error:
1698                # Unsupported locale on this system
1699                self.skipTest('test needs %s locale' % loc)
1700
1701        re.purge()
1702        self.check_en_US_iso88591()
1703        self.check_en_US_utf8()
1704        re.purge()
1705        self.check_en_US_utf8()
1706        self.check_en_US_iso88591()
1707
1708    def check_en_US_iso88591(self):
1709        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1710        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1711        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1712        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1713        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1714        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1715        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1716
1717    def check_en_US_utf8(self):
1718        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1719        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1720        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1721        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1722        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1723        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1724        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1725
1726    def test_error(self):
1727        with self.assertRaises(re.error) as cm:
1728            re.compile('(\u20ac))')
1729        err = cm.exception
1730        self.assertIsInstance(err.pattern, str)
1731        self.assertEqual(err.pattern, '(\u20ac))')
1732        self.assertEqual(err.pos, 3)
1733        self.assertEqual(err.lineno, 1)
1734        self.assertEqual(err.colno, 4)
1735        self.assertIn(err.msg, str(err))
1736        self.assertIn(' at position 3', str(err))
1737        self.assertNotIn(' at position 3', err.msg)
1738        # Bytes pattern
1739        with self.assertRaises(re.error) as cm:
1740            re.compile(b'(\xa4))')
1741        err = cm.exception
1742        self.assertIsInstance(err.pattern, bytes)
1743        self.assertEqual(err.pattern, b'(\xa4))')
1744        self.assertEqual(err.pos, 3)
1745        # Multiline pattern
1746        with self.assertRaises(re.error) as cm:
1747            re.compile("""
1748                (
1749                    abc
1750                )
1751                )
1752                (
1753                """, re.VERBOSE)
1754        err = cm.exception
1755        self.assertEqual(err.pos, 77)
1756        self.assertEqual(err.lineno, 5)
1757        self.assertEqual(err.colno, 17)
1758        self.assertIn(err.msg, str(err))
1759        self.assertIn(' at position 77', str(err))
1760        self.assertIn('(line 5, column 17)', str(err))
1761
1762    def test_misc_errors(self):
1763        self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
1764        self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
1765        self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
1766        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
1767        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
1768        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
1769        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
1770        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
1771        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
1772        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
1773        self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
1774
1775    def test_enum(self):
1776        # Issue #28082: Check that str(flag) returns a human readable string
1777        # instead of an integer
1778        self.assertIn('ASCII', str(re.A))
1779        self.assertIn('DOTALL', str(re.S))
1780
1781    def test_pattern_compare(self):
1782        pattern1 = re.compile('abc', re.IGNORECASE)
1783
1784        # equal to itself
1785        self.assertEqual(pattern1, pattern1)
1786        self.assertFalse(pattern1 != pattern1)
1787
1788        # equal
1789        re.purge()
1790        pattern2 = re.compile('abc', re.IGNORECASE)
1791        self.assertEqual(hash(pattern2), hash(pattern1))
1792        self.assertEqual(pattern2, pattern1)
1793
1794        # not equal: different pattern
1795        re.purge()
1796        pattern3 = re.compile('XYZ', re.IGNORECASE)
1797        # Don't test hash(pattern3) != hash(pattern1) because there is no
1798        # warranty that hash values are different
1799        self.assertNotEqual(pattern3, pattern1)
1800
1801        # not equal: different flag (flags=0)
1802        re.purge()
1803        pattern4 = re.compile('abc')
1804        self.assertNotEqual(pattern4, pattern1)
1805
1806        # only == and != comparison operators are supported
1807        with self.assertRaises(TypeError):
1808            pattern1 < pattern2
1809
1810    def test_pattern_compare_bytes(self):
1811        pattern1 = re.compile(b'abc')
1812
1813        # equal: test bytes patterns
1814        re.purge()
1815        pattern2 = re.compile(b'abc')
1816        self.assertEqual(hash(pattern2), hash(pattern1))
1817        self.assertEqual(pattern2, pattern1)
1818
1819        # not equal: pattern of a different types (str vs bytes),
1820        # comparison must not raise a BytesWarning
1821        re.purge()
1822        pattern3 = re.compile('abc')
1823        with warnings.catch_warnings():
1824            warnings.simplefilter('error', BytesWarning)
1825            self.assertNotEqual(pattern3, pattern1)
1826
1827    def test_bug_29444(self):
1828        s = bytearray(b'abcdefgh')
1829        m = re.search(b'[a-h]+', s)
1830        m2 = re.search(b'[e-h]+', s)
1831        self.assertEqual(m.group(), b'abcdefgh')
1832        self.assertEqual(m2.group(), b'efgh')
1833        s[:] = b'xyz'
1834        self.assertEqual(m.group(), b'xyz')
1835        self.assertEqual(m2.group(), b'')
1836
1837
class PatternReprTests(unittest.TestCase):
    # Tests for repr() of compiled pattern objects.

    def check(self, pattern, expected):
        # Compile ``pattern`` without explicit flags and compare its repr.
        self.assertEqual(repr(re.compile(pattern)), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile ``pattern`` with ``flags`` and compare its repr.
        self.assertEqual(repr(re.compile(pattern, flags)), expected)

    def test_without_flags(self):
        self.check('random pattern',
                   "re.compile('random pattern')")

    def test_single_flag(self):
        self.check_flags('random pattern', re.IGNORECASE,
            "re.compile('random pattern', re.IGNORECASE)")

    def test_multiple_flags(self):
        # Several flags are joined with '|' using their long names.
        self.check_flags('random pattern', re.I|re.S|re.X,
            "re.compile('random pattern', "
            "re.IGNORECASE|re.DOTALL|re.VERBOSE)")

    def test_unicode_flag(self):
        # re.U is not shown for str patterns, alone or combined.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        self.check_flags('random pattern', re.I|re.S|re.U,
                         "re.compile('random pattern', "
                         "re.IGNORECASE|re.DOTALL)")

    def test_inline_flags(self):
        # A flag set inline in the pattern is reported like an argument.
        self.check('(?i)pattern',
                   "re.compile('(?i)pattern', re.IGNORECASE)")

    def test_unknown_flags(self):
        # Bits without a known name are shown in hex, after named flags.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
            "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        # Bytes patterns are shown as bytes literals; re.A is shown for them.
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        self.check_flags(b'bytes pattern', re.L,
                         "re.compile(b'bytes pattern', re.LOCALE)")

    def test_quotes(self):
        # The pattern is quoted like repr() of a string: the quote style
        # adapts to the content, with escaping when both kinds occur.
        self.check('random "double quoted" pattern',
            '''re.compile('random "double quoted" pattern')''')
        self.check("random 'single quoted' pattern",
            '''re.compile("random 'single quoted' pattern")''')
        self.check('''both 'single' and "double" quotes''',
            '''re.compile('both \\'single\\' and "double" quotes')''')

    def test_long_pattern(self):
        # The repr of a very long pattern must stay short (under 300
        # characters) while keeping the start of the pattern and, when
        # present, the flags at the end.
        pattern = 'Very %spattern' % ('long ' * 1000)
        r = repr(re.compile(pattern))
        self.assertLess(len(r), 300)
        self.assertEqual(r[:30], "re.compile('Very long long lon")
        r = repr(re.compile(pattern, re.I))
        self.assertLess(len(r), 300)
        self.assertEqual(r[:30], "re.compile('Very long long lon")
        self.assertEqual(r[-16:], ", re.IGNORECASE)")
1902
1903
class ImplementationTest(unittest.TestCase):
    """
    Checks that exercise private helpers of the re implementation.
    """

    def test_overlap_table(self):
        # Each input is paired with the table that the private
        # sre_compile._generate_overlap_table helper must return for it.
        build_table = sre_compile._generate_overlap_table
        expectations = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in expectations:
            self.assertEqual(build_table(prefix), table)
1917
1918
1919class ExternalTests(unittest.TestCase):
1920
1921    def test_re_benchmarks(self):
1922        're_tests benchmarks'
1923        from test.re_tests import benchmarks
1924        for pattern, s in benchmarks:
1925            with self.subTest(pattern=pattern, string=s):
1926                p = re.compile(pattern)
1927                self.assertTrue(p.search(s))
1928                self.assertTrue(p.match(s))
1929                self.assertTrue(p.fullmatch(s))
1930                s2 = ' '*10000 + s + ' '*10000
1931                self.assertTrue(p.search(s2))
1932                self.assertTrue(p.match(s2, 10000))
1933                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
1934                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))
1935
1936    def test_re_tests(self):
1937        're_tests test suite'
1938        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1939        for t in tests:
1940            pattern = s = outcome = repl = expected = None
1941            if len(t) == 5:
1942                pattern, s, outcome, repl, expected = t
1943            elif len(t) == 3:
1944                pattern, s, outcome = t
1945            else:
1946                raise ValueError('Test tuples should have 3 or 5 fields', t)
1947
1948            with self.subTest(pattern=pattern, string=s):
1949                if outcome == SYNTAX_ERROR:  # Expected a syntax error
1950                    with self.assertRaises(re.error):
1951                        re.compile(pattern)
1952                    continue
1953
1954                obj = re.compile(pattern)
1955                result = obj.search(s)
1956                if outcome == FAIL:
1957                    self.assertIsNone(result, 'Succeeded incorrectly')
1958                    continue
1959
1960                with self.subTest():
1961                    self.assertTrue(result, 'Failed incorrectly')
1962                    # Matched, as expected, so now we compute the
1963                    # result string and compare it to our expected result.
1964                    start, end = result.span(0)
1965                    vardict = {'found': result.group(0),
1966                               'groups': result.group(),
1967                               'flags': result.re.flags}
1968                    for i in range(1, 100):
1969                        try:
1970                            gi = result.group(i)
1971                            # Special hack because else the string concat fails:
1972                            if gi is None:
1973                                gi = "None"
1974                        except IndexError:
1975                            gi = "Error"
1976                        vardict['g%d' % i] = gi
1977                    for i in result.re.groupindex.keys():
1978                        try:
1979                            gi = result.group(i)
1980                            if gi is None:
1981                                gi = "None"
1982                        except IndexError:
1983                            gi = "Error"
1984                        vardict[i] = gi
1985                    self.assertEqual(eval(repl, vardict), expected,
1986                                     'grouping error')
1987
1988                # Try the match with both pattern and string converted to
1989                # bytes, and check that it still succeeds.
1990                try:
1991                    bpat = bytes(pattern, "ascii")
1992                    bs = bytes(s, "ascii")
1993                except UnicodeEncodeError:
1994                    # skip non-ascii tests
1995                    pass
1996                else:
1997                    with self.subTest('bytes pattern match'):
1998                        obj = re.compile(bpat)
1999                        self.assertTrue(obj.search(bs))
2000
2001                    # Try the match with LOCALE enabled, and check that it
2002                    # still succeeds.
2003                    with self.subTest('locale-sensitive match'):
2004                        obj = re.compile(bpat, re.LOCALE)
2005                        result = obj.search(bs)
2006                        if result is None:
2007                            print('=== Fails on locale-sensitive match', t)
2008
2009                # Try the match with the search area limited to the extent
2010                # of the match and see if it still succeeds.  \B will
2011                # break (because it won't match at the end or start of a
2012                # string), so we'll ignore patterns that feature it.
2013                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
2014                            and result is not None):
2015                    with self.subTest('range-limited match'):
2016                        obj = re.compile(pattern)
2017                        self.assertTrue(obj.search(s, start, end + 1))
2018
2019                # Try the match with IGNORECASE enabled, and check that it
2020                # still succeeds.
2021                with self.subTest('case-insensitive match'):
2022                    obj = re.compile(pattern, re.IGNORECASE)
2023                    self.assertTrue(obj.search(s))
2024
2025                # Try the match with UNICODE locale enabled, and check
2026                # that it still succeeds.
2027                with self.subTest('unicode-sensitive match'):
2028                    obj = re.compile(pattern, re.UNICODE)
2029                    self.assertTrue(obj.search(s))
2030
2031
# Run this module's tests when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
2034