1from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \ 2 cpython_only, captured_stdout 3import io 4import locale 5import re 6import sre_compile 7import string 8import sys 9import traceback 10import unittest 11import warnings 12from re import Scanner 13from weakref import proxy 14 15# Misc tests from Tim Peters' re.doc 16 17# WARNING: Don't change details in these tests if you don't know 18# what you're doing. Some of these tests were carefully modeled to 19# cover most of the code. 20 21class S(str): 22 def __getitem__(self, index): 23 return S(super().__getitem__(index)) 24 25class B(bytes): 26 def __getitem__(self, index): 27 return B(super().__getitem__(index)) 28 29class ReTests(unittest.TestCase): 30 31 def assertTypedEqual(self, actual, expect, msg=None): 32 self.assertEqual(actual, expect, msg) 33 def recurse(actual, expect): 34 if isinstance(expect, (tuple, list)): 35 for x, y in zip(actual, expect): 36 recurse(x, y) 37 else: 38 self.assertIs(type(actual), type(expect), msg) 39 recurse(actual, expect) 40 41 def checkPatternError(self, pattern, errmsg, pos=None): 42 with self.assertRaises(re.error) as cm: 43 re.compile(pattern) 44 with self.subTest(pattern=pattern): 45 err = cm.exception 46 self.assertEqual(err.msg, errmsg) 47 if pos is not None: 48 self.assertEqual(err.pos, pos) 49 50 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): 51 with self.assertRaises(re.error) as cm: 52 re.sub(pattern, repl, string) 53 with self.subTest(pattern=pattern, repl=repl): 54 err = cm.exception 55 self.assertEqual(err.msg, errmsg) 56 if pos is not None: 57 self.assertEqual(err.pos, pos) 58 59 def test_keep_buffer(self): 60 # See bug 14212 61 b = bytearray(b'x') 62 it = re.finditer(b'a', b) 63 with self.assertRaises(BufferError): 64 b.extend(b'x'*400) 65 list(it) 66 del it 67 gc_collect() 68 b.extend(b'x'*400) 69 70 def test_weakref(self): 71 s = 'QabbbcR' 72 x = re.compile('ab+c') 73 y = proxy(x) 74 
self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 75 76 def test_search_star_plus(self): 77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 81 self.assertIsNone(re.search('x', 'aaa')) 82 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 86 self.assertIsNone(re.match('a+', 'xxx')) 87 88 def bump_num(self, matchobj): 89 int_value = int(matchobj.group(0)) 90 return str(int_value + 1) 91 92 def test_basic_re_sub(self): 93 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz') 94 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz') 95 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz') 96 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz') 97 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz') 98 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz') 99 for y in ("\xe0", "\u0430", "\U0001d49c"): 100 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz') 101 102 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 103 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 104 '9.3 -3 24x100y') 105 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 106 '9.3 -3 23x99y') 107 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3), 108 '9.3 -3 23x99y') 109 110 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 111 self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 112 113 s = r"\1\1" 114 self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 115 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s) 116 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 117 118 
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx') 119 self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx') 120 self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx') 121 self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx') 122 123 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 124 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 125 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), 126 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) 127 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': 128 with self.subTest(c): 129 with self.assertWarns(DeprecationWarning): 130 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) 131 132 self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest') 133 134 def test_bug_449964(self): 135 # fails for group followed by other escape 136 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'), 137 'xx\bxx\b') 138 139 def test_bug_449000(self): 140 # Test for sub() on escaped characters 141 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 142 'abc\ndef\n') 143 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 144 'abc\ndef\n') 145 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 146 'abc\ndef\n') 147 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 148 'abc\ndef\n') 149 150 def test_bug_1661(self): 151 # Verify that flags do not get silently ignored with compiled patterns 152 pattern = re.compile('.') 153 self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 154 self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 155 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 156 self.assertRaises(ValueError, re.compile, pattern, re.I) 157 158 def test_bug_3629(self): 159 # A regex that triggered a bug in the sre-code validator 160 re.compile("(?P<quote>)(?(quote))") 161 162 def test_sub_template_numeric_escape(self): 163 # bug 776311 and friends 164 
self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 165 self.assertEqual(re.sub('x', r'\000', 'x'), '\000') 166 self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 167 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 168 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 169 self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 170 self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 171 self.assertEqual(re.sub('x', r'\377', 'x'), '\377') 172 173 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 174 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 175 176 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 177 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 178 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 179 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 180 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 181 182 self.checkTemplateError('x', r'\400', 'x', 183 r'octal escape value \400 outside of ' 184 r'range 0-0o377', 0) 185 self.checkTemplateError('x', r'\777', 'x', 186 r'octal escape value \777 outside of ' 187 r'range 0-0o377', 0) 188 189 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1) 190 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1) 191 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1) 192 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1) 193 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1) 194 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1) 195 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1) 196 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1) 197 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1) 198 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1) 199 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1) 200 self.checkTemplateError('x', 
r'\800', 'x', 'invalid group reference 80', 1) 201 self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1) 202 203 # in python2.3 (etc), these loop endlessly in sre_parser.py 204 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 205 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 206 'xz8') 207 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 208 'xza') 209 210 def test_qualified_re_sub(self): 211 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 212 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 213 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') 214 215 def test_bug_114660(self): 216 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 217 'hello there') 218 219 def test_bug_462270(self): 220 # Test for empty sub() behaviour, see SF bug #462270 221 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') 222 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') 223 224 def test_symbolic_groups(self): 225 re.compile(r'(?P<a>x)(?P=a)(?(a)y)') 226 re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)') 227 re.compile(r'(?P<a1>x)\1(?(1)y)') 228 self.checkPatternError(r'(?P<a>)(?P<a>)', 229 "redefinition of group name 'a' as group 2; " 230 "was group 1") 231 self.checkPatternError(r'(?P<a>(?P=a))', 232 "cannot refer to an open group", 10) 233 self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px') 234 self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11) 235 self.checkPatternError(r'(?P=', 'missing group name', 4) 236 self.checkPatternError(r'(?P=)', 'missing group name', 4) 237 self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4) 238 self.checkPatternError(r'(?P=a)', "unknown group name 'a'") 239 self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'") 240 self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4) 241 self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4) 242 
self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4) 243 self.checkPatternError(r'(?P<', 'missing group name', 4) 244 self.checkPatternError(r'(?P<>)', 'missing group name', 4) 245 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4) 246 self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4) 247 self.checkPatternError(r'(?(', 'missing group name', 3) 248 self.checkPatternError(r'(?())', 'missing group name', 3) 249 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3) 250 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) 251 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) 252 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) 253 # New valid/invalid identifiers in Python 3 254 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') 255 re.compile('(?P<>x)(?P=)(?()y)') 256 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) 257 # Support > 100 groups. 258 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 259 pat = '(?:%s)(?(200)z|t)' % pat 260 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 261 262 def test_symbolic_refs(self): 263 self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx', 264 'missing >, unterminated name', 3) 265 self.checkTemplateError('(?P<a>x)', r'\g<', 'xx', 266 'missing group name', 3) 267 self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2) 268 self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx', 269 "bad character in group name 'a a'", 3) 270 self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx', 271 'missing group name', 3) 272 self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx', 273 "bad character in group name '1a1'", 3) 274 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx', 275 'invalid group reference 2', 3) 276 self.checkTemplateError('(?P<a>x)', r'\2', 'xx', 277 'invalid group reference 2', 1) 278 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): 279 
re.sub('(?P<a>x)', r'\g<ab>', 'xx') 280 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') 281 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') 282 self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', 283 "bad character in group name '-1'", 3) 284 # New valid/invalid identifiers in Python 3 285 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') 286 self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx') 287 self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx', 288 "bad character in group name '©'", 3) 289 # Support > 100 groups. 290 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 291 self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') 292 293 def test_re_subn(self): 294 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 295 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 296 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 297 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 298 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 299 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2)) 300 301 def test_re_split(self): 302 for string in ":a:b::c", S(":a:b::c"): 303 self.assertTypedEqual(re.split(":", string), 304 ['', 'a', 'b', '', 'c']) 305 self.assertTypedEqual(re.split(":+", string), 306 ['', 'a', 'b', 'c']) 307 self.assertTypedEqual(re.split("(:+)", string), 308 ['', ':', 'a', ':', 'b', '::', 'c']) 309 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), 310 memoryview(b":a:b::c")): 311 self.assertTypedEqual(re.split(b":", string), 312 [b'', b'a', b'b', b'', b'c']) 313 self.assertTypedEqual(re.split(b":+", string), 314 [b'', b'a', b'b', b'c']) 315 self.assertTypedEqual(re.split(b"(:+)", string), 316 [b'', b':', b'a', b':', b'b', b'::', b'c']) 317 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", 318 "\U0001d49c\U0001d49e\U0001d4b5"): 319 string = ":%s:%s::%s" % (a, b, c) 320 self.assertEqual(re.split(":", 
string), ['', a, b, '', c]) 321 self.assertEqual(re.split(":+", string), ['', a, b, c]) 322 self.assertEqual(re.split("(:+)", string), 323 ['', ':', a, ':', b, '::', c]) 324 325 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) 326 self.assertEqual(re.split("(:)+", ":a:b::c"), 327 ['', ':', 'a', ':', 'b', ':', 'c']) 328 self.assertEqual(re.split("([b:]+)", ":a:b::c"), 329 ['', ':', 'a', ':b::', 'c']) 330 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 331 ['', None, ':', 'a', None, ':', '', 'b', None, '', 332 None, '::', 'c']) 333 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 334 ['', 'a', '', '', 'c']) 335 336 for sep, expected in [ 337 (':*', ['', 'a', 'b', 'c']), 338 ('(?::*)', ['', 'a', 'b', 'c']), 339 ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), 340 ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), 341 ]: 342 with self.subTest(sep=sep), self.assertWarns(FutureWarning): 343 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 344 345 for sep, expected in [ 346 ('', [':a:b::c']), 347 (r'\b', [':a:b::c']), 348 (r'(?=:)', [':a:b::c']), 349 (r'(?<=:)', [':a:b::c']), 350 ]: 351 with self.subTest(sep=sep), self.assertRaises(ValueError): 352 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 353 354 def test_qualified_re_split(self): 355 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 356 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) 357 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) 358 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), 359 ['', ':', 'a', ':', 'b::c']) 360 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), 361 ['', ':', 'a', ':', 'b::c']) 362 with self.assertWarns(FutureWarning): 363 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), 364 ['', ':', 'a', ':', 'b::c']) 365 366 def test_re_findall(self): 367 self.assertEqual(re.findall(":+", "abc"), []) 368 for string in "a:b::c:::d", S("a:b::c:::d"): 369 
self.assertTypedEqual(re.findall(":+", string), 370 [":", "::", ":::"]) 371 self.assertTypedEqual(re.findall("(:+)", string), 372 [":", "::", ":::"]) 373 self.assertTypedEqual(re.findall("(:)(:*)", string), 374 [(":", ""), (":", ":"), (":", "::")]) 375 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"), 376 memoryview(b"a:b::c:::d")): 377 self.assertTypedEqual(re.findall(b":+", string), 378 [b":", b"::", b":::"]) 379 self.assertTypedEqual(re.findall(b"(:+)", string), 380 [b":", b"::", b":::"]) 381 self.assertTypedEqual(re.findall(b"(:)(:*)", string), 382 [(b":", b""), (b":", b":"), (b":", b"::")]) 383 for x in ("\xe0", "\u0430", "\U0001d49c"): 384 xx = x * 2 385 xxx = x * 3 386 string = "a%sb%sc%sd" % (x, xx, xxx) 387 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx]) 388 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx]) 389 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string), 390 [(x, ""), (x, x), (x, xx)]) 391 392 def test_bug_117612(self): 393 self.assertEqual(re.findall(r"(a|(b))", "aba"), 394 [("a", ""),("b", "b"),("a", "")]) 395 396 def test_re_match(self): 397 for string in 'a', S('a'): 398 self.assertEqual(re.match('a', string).groups(), ()) 399 self.assertEqual(re.match('(a)', string).groups(), ('a',)) 400 self.assertEqual(re.match('(a)', string).group(0), 'a') 401 self.assertEqual(re.match('(a)', string).group(1), 'a') 402 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a')) 403 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'): 404 self.assertEqual(re.match(b'a', string).groups(), ()) 405 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',)) 406 self.assertEqual(re.match(b'(a)', string).group(0), b'a') 407 self.assertEqual(re.match(b'(a)', string).group(1), b'a') 408 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a')) 409 for a in ("\xe0", "\u0430", "\U0001d49c"): 410 self.assertEqual(re.match(a, a).groups(), ()) 411 self.assertEqual(re.match('(%s)' % 
a, a).groups(), (a,)) 412 self.assertEqual(re.match('(%s)' % a, a).group(0), a) 413 self.assertEqual(re.match('(%s)' % a, a).group(1), a) 414 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a)) 415 416 pat = re.compile('((a)|(b))(c)?') 417 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 418 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 419 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 420 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 421 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 422 423 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 424 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 425 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 426 (None, 'b', None)) 427 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 428 429 def test_group(self): 430 class Index: 431 def __init__(self, value): 432 self.value = value 433 def __index__(self): 434 return self.value 435 # A single group 436 m = re.match('(a)(b)', 'ab') 437 self.assertEqual(m.group(), 'ab') 438 self.assertEqual(m.group(0), 'ab') 439 self.assertEqual(m.group(1), 'a') 440 self.assertEqual(m.group(Index(1)), 'a') 441 self.assertRaises(IndexError, m.group, -1) 442 self.assertRaises(IndexError, m.group, 3) 443 self.assertRaises(IndexError, m.group, 1<<1000) 444 self.assertRaises(IndexError, m.group, Index(1<<1000)) 445 self.assertRaises(IndexError, m.group, 'x') 446 # Multiple groups 447 self.assertEqual(m.group(2, 1), ('b', 'a')) 448 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a')) 449 450 def test_match_getitem(self): 451 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 452 453 m = pat.match('a') 454 self.assertEqual(m['a1'], 'a') 455 self.assertEqual(m['b2'], None) 456 self.assertEqual(m['c3'], None) 457 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None') 458 self.assertEqual(m[0], 'a') 459 
self.assertEqual(m[1], 'a') 460 self.assertEqual(m[2], None) 461 self.assertEqual(m[3], None) 462 with self.assertRaisesRegex(IndexError, 'no such group'): 463 m['X'] 464 with self.assertRaisesRegex(IndexError, 'no such group'): 465 m[-1] 466 with self.assertRaisesRegex(IndexError, 'no such group'): 467 m[4] 468 with self.assertRaisesRegex(IndexError, 'no such group'): 469 m[0, 1] 470 with self.assertRaisesRegex(IndexError, 'no such group'): 471 m[(0,)] 472 with self.assertRaisesRegex(IndexError, 'no such group'): 473 m[(0, 1)] 474 with self.assertRaisesRegex(KeyError, 'a2'): 475 'a1={a2}'.format_map(m) 476 477 m = pat.match('ac') 478 self.assertEqual(m['a1'], 'a') 479 self.assertEqual(m['b2'], None) 480 self.assertEqual(m['c3'], 'c') 481 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c') 482 self.assertEqual(m[0], 'ac') 483 self.assertEqual(m[1], 'a') 484 self.assertEqual(m[2], None) 485 self.assertEqual(m[3], 'c') 486 487 # Cannot assign. 488 with self.assertRaises(TypeError): 489 m[0] = 1 490 491 # No len(). 492 self.assertRaises(TypeError, len, m) 493 494 def test_re_fullmatch(self): 495 # Issue 16203: Proposal: add re.fullmatch() method. 
496 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1)) 497 for string in "ab", S("ab"): 498 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2)) 499 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"): 500 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2)) 501 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e": 502 r = r"%s|%s" % (a, a + b) 503 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2)) 504 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3)) 505 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3)) 506 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2)) 507 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3)) 508 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) 509 self.assertIsNone(re.fullmatch(r"a+", "ab")) 510 self.assertIsNone(re.fullmatch(r"abc$", "abc\n")) 511 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n")) 512 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n")) 513 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) 514 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4)) 515 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2)) 516 517 self.assertEqual( 518 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 519 self.assertEqual( 520 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 521 self.assertEqual( 522 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 523 524 def test_re_groupref_exists(self): 525 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 526 ('(', 'a')) 527 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(), 528 (None, 'a')) 529 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)')) 530 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a')) 531 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 532 ('a', 'b')) 533 
self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 534 (None, 'd')) 535 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 536 (None, 'd')) 537 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(), 538 ('a', '')) 539 540 # Tests for bug #1177831: exercise groups other than the first group 541 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 542 self.assertEqual(p.match('abc').groups(), 543 ('a', 'b', 'c')) 544 self.assertEqual(p.match('ad').groups(), 545 ('a', None, 'd')) 546 self.assertIsNone(p.match('abd')) 547 self.assertIsNone(p.match('ac')) 548 549 # Support > 100 groups. 550 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 551 pat = '(?:%s)(?(200)z)' % pat 552 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 553 554 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10) 555 self.checkPatternError(r'()(?(1)a|b', 556 'missing ), unterminated subpattern', 2) 557 self.checkPatternError(r'()(?(1)a|b|c)', 558 'conditional backref with more than ' 559 'two branches', 10) 560 561 def test_re_groupref_overflow(self): 562 from sre_constants import MAXGROUPS 563 self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', 564 'invalid group reference %d' % MAXGROUPS, 3) 565 self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, 566 'invalid group reference %d' % MAXGROUPS, 10) 567 568 def test_re_groupref(self): 569 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 570 ('|', 'a')) 571 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 572 (None, 'a')) 573 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|')) 574 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a')) 575 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 576 ('a', 'a')) 577 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 578 (None, None)) 579 580 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4) 581 582 def test_groupdict(self): 583 
self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 584 'first second').groupdict(), 585 {'first':'first', 'second':'second'}) 586 587 def test_expand(self): 588 self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 589 "first second") 590 .expand(r"\2 \1 \g<second> \g<first>"), 591 "second first second first") 592 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)", 593 "first") 594 .expand(r"\2 \g<second>"), 595 " ") 596 597 def test_repeat_minmax(self): 598 self.assertIsNone(re.match(r"^(\w){1}$", "abc")) 599 self.assertIsNone(re.match(r"^(\w){1}?$", "abc")) 600 self.assertIsNone(re.match(r"^(\w){1,2}$", "abc")) 601 self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc")) 602 603 self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c") 604 self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c") 605 self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c") 606 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 607 self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c") 608 self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c") 609 self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c") 610 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 611 612 self.assertIsNone(re.match(r"^x{1}$", "xxx")) 613 self.assertIsNone(re.match(r"^x{1}?$", "xxx")) 614 self.assertIsNone(re.match(r"^x{1,2}$", "xxx")) 615 self.assertIsNone(re.match(r"^x{1,2}?$", "xxx")) 616 617 self.assertTrue(re.match(r"^x{3}$", "xxx")) 618 self.assertTrue(re.match(r"^x{1,3}$", "xxx")) 619 self.assertTrue(re.match(r"^x{3,3}$", "xxx")) 620 self.assertTrue(re.match(r"^x{1,4}$", "xxx")) 621 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 622 self.assertTrue(re.match(r"^x{3}?$", "xxx")) 623 self.assertTrue(re.match(r"^x{1,3}?$", "xxx")) 624 self.assertTrue(re.match(r"^x{1,4}?$", "xxx")) 625 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 626 627 self.assertIsNone(re.match(r"^x{}$", "xxx")) 628 
self.assertTrue(re.match(r"^x{}$", "x{}")) 629 630 self.checkPatternError(r'x{2,1}', 631 'min repeat greater than max repeat', 2) 632 633 def test_getattr(self): 634 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") 635 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) 636 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2) 637 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {}) 638 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, 639 {'first': 1, 'other': 2}) 640 641 self.assertEqual(re.match("(a)", "a").pos, 0) 642 self.assertEqual(re.match("(a)", "a").endpos, 1) 643 self.assertEqual(re.match("(a)", "a").string, "a") 644 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 645 self.assertTrue(re.match("(a)", "a").re) 646 647 # Issue 14260. groupindex should be non-modifiable mapping. 648 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)') 649 self.assertEqual(sorted(p.groupindex), ['first', 'other']) 650 self.assertEqual(p.groupindex['other'], 2) 651 with self.assertRaises(TypeError): 652 p.groupindex['other'] = 0 653 self.assertEqual(p.groupindex['other'], 2) 654 655 def test_special_escapes(self): 656 self.assertEqual(re.search(r"\b(b.)\b", 657 "abcd abc bcd bx").group(1), "bx") 658 self.assertEqual(re.search(r"\B(b.)\B", 659 "abc bcd bc abxd").group(1), "bx") 660 self.assertEqual(re.search(r"\b(b.)\b", 661 "abcd abc bcd bx", re.ASCII).group(1), "bx") 662 self.assertEqual(re.search(r"\B(b.)\B", 663 "abc bcd bc abxd", re.ASCII).group(1), "bx") 664 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 665 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 666 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) 667 self.assertEqual(re.search(br"\b(b.)\b", 668 b"abcd abc bcd bx").group(1), b"bx") 669 self.assertEqual(re.search(br"\B(b.)\B", 670 b"abc bcd bc abxd").group(1), b"bx") 671 self.assertEqual(re.search(br"\b(b.)\b", 672 b"abcd abc bcd bx", 
re.LOCALE).group(1), b"bx") 673 self.assertEqual(re.search(br"\B(b.)\B", 674 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx") 675 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc") 676 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc") 677 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M)) 678 self.assertEqual(re.search(r"\d\D\w\W\s\S", 679 "1aa! a").group(0), "1aa! a") 680 self.assertEqual(re.search(br"\d\D\w\W\s\S", 681 b"1aa! a").group(0), b"1aa! a") 682 self.assertEqual(re.search(r"\d\D\w\W\s\S", 683 "1aa! a", re.ASCII).group(0), "1aa! a") 684 self.assertEqual(re.search(br"\d\D\w\W\s\S", 685 b"1aa! a", re.LOCALE).group(0), b"1aa! a") 686 687 def test_other_escapes(self): 688 self.checkPatternError("\\", 'bad escape (end of pattern)', 0) 689 self.assertEqual(re.match(r"\(", '(').group(), '(') 690 self.assertIsNone(re.match(r"\(", ')')) 691 self.assertEqual(re.match(r"\\", '\\').group(), '\\') 692 self.assertEqual(re.match(r"[\]]", ']').group(), ']') 693 self.assertIsNone(re.match(r"[\]]", '[')) 694 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') 695 self.assertIsNone(re.match(r"[a\-c]", 'b')) 696 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') 697 self.assertIsNone(re.match(r"[\^a]+", 'b')) 698 re.purge() # for warnings 699 for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': 700 with self.subTest(c): 701 self.assertRaises(re.error, re.compile, '\\%c' % c) 702 for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': 703 with self.subTest(c): 704 self.assertRaises(re.error, re.compile, '[\\%c]' % c) 705 706 def test_string_boundaries(self): 707 # See http://bugs.python.org/issue10713 708 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), 709 "abc") 710 # There's a word boundary at the start of a string. 711 self.assertTrue(re.match(r"\b", "abc")) 712 # A non-empty string includes a non-boundary zero-length match. 
713 self.assertTrue(re.search(r"\B", "abc")) 714 # There is no non-boundary match at the start of a string. 715 self.assertFalse(re.match(r"\B", "abc")) 716 # However, an empty string contains no word boundaries, and also no 717 # non-boundaries. 718 self.assertIsNone(re.search(r"\B", "")) 719 # This one is questionable and different from the perlre behaviour, 720 # but describes current behavior. 721 self.assertIsNone(re.search(r"\b", "")) 722 # A single word-character string has two boundaries, but no 723 # non-boundary gaps. 724 self.assertEqual(len(re.findall(r"\b", "a")), 2) 725 self.assertEqual(len(re.findall(r"\B", "a")), 0) 726 # If there are no words, there are no boundaries 727 self.assertEqual(len(re.findall(r"\b", " ")), 0) 728 self.assertEqual(len(re.findall(r"\b", " ")), 0) 729 # Can match around the whitespace. 730 self.assertEqual(len(re.findall(r"\B", " ")), 2) 731 732 def test_bigcharset(self): 733 self.assertEqual(re.match("([\u2222\u2223])", 734 "\u2222").group(1), "\u2222") 735 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255))) 736 self.assertEqual(re.match(r, "\uff01").group(), "\uff01") 737 738 def test_big_codesize(self): 739 # Issue #1160 740 r = re.compile('|'.join(('%d'%x for x in range(10000)))) 741 self.assertTrue(r.match('1000')) 742 self.assertTrue(r.match('9999')) 743 744 def test_anyall(self): 745 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 746 "a\nb") 747 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 748 "a\n\nb") 749 750 def test_lookahead(self): 751 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a") 752 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a") 753 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a") 754 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a") 755 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 756 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 757 
self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 758 759 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 760 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 761 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 762 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 763 764 # Group reference. 765 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba')) 766 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac')) 767 # Conditional group reference. 768 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 769 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc')) 770 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 771 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc')) 772 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc')) 773 # Group used before defined. 774 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc')) 775 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc')) 776 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc')) 777 778 def test_lookbehind(self): 779 self.assertTrue(re.match(r'ab(?<=b)c', 'abc')) 780 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc')) 781 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc')) 782 self.assertTrue(re.match(r'ab(?<!c)c', 'abc')) 783 # Group reference. 784 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac')) 785 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa')) 786 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac')) 787 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa')) 788 # Conditional group reference. 789 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc')) 790 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc')) 791 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc')) 792 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc')) 793 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc')) 794 # Group used before defined. 
795 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)') 796 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc')) 797 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc')) 798 # Group defined in the same lookbehind pattern 799 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)') 800 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)') 801 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') 802 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') 803 804 def test_ignore_case(self): 805 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 806 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") 807 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 808 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 809 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 810 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 811 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 812 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 813 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 814 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 815 816 assert '\u212a'.lower() == 'k' # 'K' 817 self.assertTrue(re.match(r'K', '\u212a', re.I)) 818 self.assertTrue(re.match(r'k', '\u212a', re.I)) 819 self.assertTrue(re.match(r'\u212a', 'K', re.I)) 820 self.assertTrue(re.match(r'\u212a', 'k', re.I)) 821 assert '\u017f'.upper() == 'S' # 'ſ' 822 self.assertTrue(re.match(r'S', '\u017f', re.I)) 823 self.assertTrue(re.match(r's', '\u017f', re.I)) 824 self.assertTrue(re.match(r'\u017f', 'S', re.I)) 825 self.assertTrue(re.match(r'\u017f', 's', re.I)) 826 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 827 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) 828 self.assertTrue(re.match(r'\ufb06', 
'\ufb05', re.I)) 829 830 def test_ignore_case_set(self): 831 self.assertTrue(re.match(r'[19A]', 'A', re.I)) 832 self.assertTrue(re.match(r'[19a]', 'a', re.I)) 833 self.assertTrue(re.match(r'[19a]', 'A', re.I)) 834 self.assertTrue(re.match(r'[19A]', 'a', re.I)) 835 self.assertTrue(re.match(br'[19A]', b'A', re.I)) 836 self.assertTrue(re.match(br'[19a]', b'a', re.I)) 837 self.assertTrue(re.match(br'[19a]', b'A', re.I)) 838 self.assertTrue(re.match(br'[19A]', b'a', re.I)) 839 assert '\u212a'.lower() == 'k' # 'K' 840 self.assertTrue(re.match(r'[19K]', '\u212a', re.I)) 841 self.assertTrue(re.match(r'[19k]', '\u212a', re.I)) 842 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I)) 843 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I)) 844 assert '\u017f'.upper() == 'S' # 'ſ' 845 self.assertTrue(re.match(r'[19S]', '\u017f', re.I)) 846 self.assertTrue(re.match(r'[19s]', '\u017f', re.I)) 847 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I)) 848 self.assertTrue(re.match(r'[19\u017f]', 's', re.I)) 849 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 850 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) 851 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) 852 853 def test_ignore_case_range(self): 854 # Issues #3511, #17381. 
855 self.assertTrue(re.match(r'[9-a]', '_', re.I)) 856 self.assertIsNone(re.match(r'[9-A]', '_', re.I)) 857 self.assertTrue(re.match(br'[9-a]', b'_', re.I)) 858 self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) 859 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) 860 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) 861 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) 862 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) 863 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) 864 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) 865 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I)) 866 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I)) 867 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I)) 868 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I)) 869 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) 870 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) 871 872 assert '\u212a'.lower() == 'k' # 'K' 873 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) 874 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I)) 875 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I)) 876 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I)) 877 assert '\u017f'.upper() == 'S' # 'ſ' 878 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I)) 879 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I)) 880 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I)) 881 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I)) 882 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 883 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I)) 884 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I)) 885 886 def test_category(self): 887 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 888 889 def test_getlower(self): 890 import _sre 891 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) 892 
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the match has
    # the expected matched text and span.  `match` and `span` must be given
    # together; when both are omitted the pattern is expected to match the
    # whole of `text`.
    omitted = (match is None, span is None)
    if all(omitted):
        match, span = text, (0, len(text))
    elif any(omitted):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
self.assertEqual(re.escape(b), b'\\000') 945 else: 946 self.assertEqual(re.escape(b), b'\\' + b) 947 self.assertMatch(re.escape(b), b) 948 self.assertMatch(re.escape(p), p) 949 950 def test_re_escape_non_ascii(self): 951 s = 'xxx\u2620\u2620\u2620xxx' 952 s_escaped = re.escape(s) 953 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') 954 self.assertMatch(s_escaped, s) 955 self.assertMatch('.%s+.' % re.escape('\u2620'), s, 956 'x\u2620\u2620\u2620x', (2, 7), re.search) 957 958 def test_re_escape_non_ascii_bytes(self): 959 b = 'y\u2620y\u2620y'.encode('utf-8') 960 b_escaped = re.escape(b) 961 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') 962 self.assertMatch(b_escaped, b) 963 res = re.findall(re.escape('\u2620'.encode('utf-8')), b) 964 self.assertEqual(len(res), 2) 965 966 def test_pickling(self): 967 import pickle 968 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE) 969 for proto in range(pickle.HIGHEST_PROTOCOL + 1): 970 pickled = pickle.dumps(oldpat, proto) 971 newpat = pickle.loads(pickled) 972 self.assertEqual(newpat, oldpat) 973 # current pickle expects the _compile() reconstructor in re module 974 from re import _compile 975 976 def test_constants(self): 977 self.assertEqual(re.I, re.IGNORECASE) 978 self.assertEqual(re.L, re.LOCALE) 979 self.assertEqual(re.M, re.MULTILINE) 980 self.assertEqual(re.S, re.DOTALL) 981 self.assertEqual(re.X, re.VERBOSE) 982 983 def test_flags(self): 984 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]: 985 self.assertTrue(re.compile('^pattern$', flag)) 986 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]: 987 self.assertTrue(re.compile(b'^pattern$', flag)) 988 989 def test_sre_character_literals(self): 990 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: 991 if i < 256: 992 self.assertTrue(re.match(r"\%03o" % i, chr(i))) 993 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0")) 994 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8")) 995 
def test_sre_character_class_literals(self):
    # Escape templates, formatted with a code point, that are tried
    # inside a character set.
    octal_and_hex = (
        r"[\%o]", r"[\%o8]",
        r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    short_unicode = (r"[\u%04x]", r"[\u%04x0]", r"[\u%04xz]")

    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(i)
        templates = []
        if i < 256:
            # Octal and \x escapes are only exercised below 256.
            templates.extend(octal_and_hex)
        if i < 0x10000:
            # \u escapes are only exercised for BMP code points.
            templates.extend(short_unicode)
        for template in templates:
            self.assertTrue(re.match(template % i, char))
        # \U escapes are exercised for every code point in the list.
        self.assertTrue(re.match(r"[\U%08x]" % i, char))
        self.assertTrue(re.match(r"[\U%08x0]" % i, char + "0"))
        self.assertTrue(re.match(r"[\U%08xz]" % i, char + "z"))

    # Malformed escapes inside a set are all reported at position 1.
    for pattern, message in (
        (r"[\567]",
         r'octal escape value \567 outside of '
         r'range 0-0o377'),
        (r"[\911]", r'bad escape \9'),
        (r"[\x1z]", r'incomplete escape \x1'),
        (r"[\u123z]", r'incomplete escape \u123'),
        (r"[\U0001234z]", r'incomplete escape \U0001234'),
        (r"[\U00110000]", r'bad escape \U00110000'),
    ):
        self.checkPatternError(pattern, message, 1)

    # A range whose endpoints are both written as \U escapes.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
def test_sre_byte_class_literals(self):
    # Octal and hex escape templates tried inside a bytes character set.
    templates = (
        r"[\%o]", r"[\%o8]",
        r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        byte = bytes([i])
        for template in templates:
            self.assertTrue(re.match((template % i).encode(), byte))

    # \u and \U escapes are rejected in bytes patterns.
    for pattern in (br"[\u1234]", br"[\U00012345]"):
        with self.assertRaises(re.error):
            re.compile(pattern)

    # Malformed escapes inside a set are all reported at position 1.
    for pattern, message in (
        (br"[\567]",
         r'octal escape value \567 outside of '
         r'range 0-0o377'),
        (br"[\911]", r'bad escape \9'),
        (br"[\x1z]", r'incomplete escape \x1'),
    ):
        self.checkPatternError(pattern, message, 1)
1094 self.checkPatternError(r"[a-", 'unterminated character set', 0) 1095 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1) 1096 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1) 1097 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1) 1098 1099 def test_bug_113254(self): 1100 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 1101 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 1102 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 1103 1104 def test_bug_527371(self): 1105 # bug described in patches 527371/672491 1106 self.assertIsNone(re.match(r'(a)?a','a').lastindex) 1107 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 1108 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 1109 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a') 1110 self.assertEqual(re.match(r"((a))", "a").lastindex, 1) 1111 1112 def test_bug_418626(self): 1113 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 1114 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 1115 # pattern '*?' on a long string. 1116 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 1117 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 1118 20003) 1119 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 1120 # non-simple '*?' still used to hit the recursion limit, before the 1121 # non-recursive scheme was implemented. 1122 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 1123 1124 def test_bug_612074(self): 1125 pat="["+re.escape("\u2039")+"]" 1126 self.assertEqual(re.compile(pat) and 1, 1) 1127 1128 def test_stack_overflow(self): 1129 # nasty cases that used to overflow the straightforward recursive 1130 # implementation of repeated groups. 
1131 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 1132 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 1133 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 1134 1135 def test_nothing_to_repeat(self): 1136 for reps in '*', '+', '?', '{1,2}': 1137 for mod in '', '?': 1138 self.checkPatternError('%s%s' % (reps, mod), 1139 'nothing to repeat', 0) 1140 self.checkPatternError('(?:%s%s)' % (reps, mod), 1141 'nothing to repeat', 3) 1142 1143 def test_multiple_repeat(self): 1144 for outer_reps in '*', '+', '{1,2}': 1145 for outer_mod in '', '?': 1146 outer_op = outer_reps + outer_mod 1147 for inner_reps in '*', '+', '?', '{1,2}': 1148 for inner_mod in '', '?': 1149 inner_op = inner_reps + inner_mod 1150 self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 1151 'multiple repeat', 1 + len(inner_op)) 1152 1153 def test_unlimited_zero_width_repeat(self): 1154 # Issue #9669 1155 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) 1156 self.assertIsNone(re.match(r'(?:a?)+y', 'z')) 1157 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z')) 1158 self.assertIsNone(re.match(r'(?:a?)*?y', 'z')) 1159 self.assertIsNone(re.match(r'(?:a?)+?y', 'z')) 1160 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z')) 1161 1162 def test_scanner(self): 1163 def s_ident(scanner, token): return token 1164 def s_operator(scanner, token): return "op%s" % token 1165 def s_float(scanner, token): return float(token) 1166 def s_int(scanner, token): return int(token) 1167 1168 scanner = Scanner([ 1169 (r"[a-zA-Z_]\w*", s_ident), 1170 (r"\d+\.\d*", s_float), 1171 (r"\d+", s_int), 1172 (r"=|\+|-|\*|/", s_operator), 1173 (r"\s+", None), 1174 ]) 1175 1176 self.assertTrue(scanner.scanner.scanner("").pattern) 1177 1178 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 1179 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 1180 'op+', 'bar'], '')) 1181 1182 def test_bug_448951(self): 1183 # bug 448951 (similar to 429357, but with single char match) 1184 # (Also test 
greedy matches.) 1185 for op in '','?','*': 1186 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 1187 (None, None)) 1188 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 1189 ('a:', 'a')) 1190 1191 def test_bug_725106(self): 1192 # capturing groups in alternatives in repeats 1193 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 1194 ('b', 'a')) 1195 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 1196 ('c', 'b')) 1197 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 1198 ('b', None)) 1199 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 1200 ('b', None)) 1201 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 1202 ('b', 'a')) 1203 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 1204 ('c', 'b')) 1205 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 1206 ('b', None)) 1207 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 1208 ('b', None)) 1209 1210 def test_bug_725149(self): 1211 # mark_stack_base restoring before restoring marks 1212 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 1213 ('a', None)) 1214 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 1215 ('a', None, None)) 1216 1217 def test_bug_764548(self): 1218 # bug 764548, re.compile() barfs on str/unicode subclasses 1219 class my_unicode(str): pass 1220 pat = re.compile(my_unicode("abc")) 1221 self.assertIsNone(pat.match("xyz")) 1222 1223 def test_finditer(self): 1224 iter = re.finditer(r":+", "a:b::c:::d") 1225 self.assertEqual([item.group(0) for item in iter], 1226 [":", "::", ":::"]) 1227 1228 pat = re.compile(r":+") 1229 iter = pat.finditer("a:b::c:::d", 1, 10) 1230 self.assertEqual([item.group(0) for item in iter], 1231 [":", "::", ":::"]) 1232 1233 pat = re.compile(r":+") 1234 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10) 1235 self.assertEqual([item.group(0) for item in iter], 1236 [":", "::", ":::"]) 1237 1238 pat = re.compile(r":+") 1239 iter = 
pat.finditer("a:b::c:::d", endpos=10, pos=1) 1240 self.assertEqual([item.group(0) for item in iter], 1241 [":", "::", ":::"]) 1242 1243 pat = re.compile(r":+") 1244 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8) 1245 self.assertEqual([item.group(0) for item in iter], 1246 ["::", "::"]) 1247 1248 def test_bug_926075(self): 1249 self.assertIsNot(re.compile('bug_926075'), 1250 re.compile(b'bug_926075')) 1251 1252 def test_bug_931848(self): 1253 pattern = "[\u002E\u3002\uFF0E\uFF61]" 1254 self.assertEqual(re.compile(pattern).split("a.b.c"), 1255 ['a','b','c']) 1256 1257 def test_bug_581080(self): 1258 iter = re.finditer(r"\s", "a b") 1259 self.assertEqual(next(iter).span(), (1,2)) 1260 self.assertRaises(StopIteration, next, iter) 1261 1262 scanner = re.compile(r"\s").scanner("a b") 1263 self.assertEqual(scanner.search().span(), (1, 2)) 1264 self.assertIsNone(scanner.search()) 1265 1266 def test_bug_817234(self): 1267 iter = re.finditer(r".*", "asdf") 1268 self.assertEqual(next(iter).span(), (0, 4)) 1269 self.assertEqual(next(iter).span(), (4, 4)) 1270 self.assertRaises(StopIteration, next, iter) 1271 1272 def test_bug_6561(self): 1273 # '\d' should match characters in Unicode category 'Nd' 1274 # (Number, Decimal Digit), but not those in 'Nl' (Number, 1275 # Letter) or 'No' (Number, Other). 
1276 decimal_digits = [ 1277 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd' 1278 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' 1279 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 1280 ] 1281 for x in decimal_digits: 1282 self.assertEqual(re.match(r'^\d$', x).group(0), x) 1283 1284 not_decimal_digits = [ 1285 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' 1286 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 1287 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No' 1288 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 1289 ] 1290 for x in not_decimal_digits: 1291 self.assertIsNone(re.match(r'^\d$', x)) 1292 1293 def test_empty_array(self): 1294 # SF buf 1647541 1295 import array 1296 for typecode in 'bBuhHiIlLfd': 1297 a = array.array(typecode) 1298 self.assertIsNone(re.compile(b"bla").match(a)) 1299 self.assertEqual(re.compile(b"").match(a).groups(), ()) 1300 1301 def test_inline_flags(self): 1302 # Bug #1700 1303 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below 1304 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below 1305 1306 p = re.compile(upper_char, re.I | re.U) 1307 q = p.match(lower_char) 1308 self.assertTrue(q) 1309 1310 p = re.compile(lower_char, re.I | re.U) 1311 q = p.match(upper_char) 1312 self.assertTrue(q) 1313 1314 p = re.compile('(?i)' + upper_char, re.U) 1315 q = p.match(lower_char) 1316 self.assertTrue(q) 1317 1318 p = re.compile('(?i)' + lower_char, re.U) 1319 q = p.match(upper_char) 1320 self.assertTrue(q) 1321 1322 p = re.compile('(?iu)' + upper_char) 1323 q = p.match(lower_char) 1324 self.assertTrue(q) 1325 1326 p = re.compile('(?iu)' + lower_char) 1327 q = p.match(upper_char) 1328 self.assertTrue(q) 1329 1330 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char)) 1331 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char)) 1332 1333 p = upper_char + '(?i)' 1334 with self.assertWarns(DeprecationWarning) as warns: 1335 self.assertTrue(re.match(p, lower_char)) 1336 self.assertEqual( 1337 
str(warns.warnings[0].message), 1338 'Flags not at the start of the expression %s' % p 1339 ) 1340 1341 p = upper_char + '(?i)%s' % ('.?' * 100) 1342 with self.assertWarns(DeprecationWarning) as warns: 1343 self.assertTrue(re.match(p, lower_char)) 1344 self.assertEqual( 1345 str(warns.warnings[0].message), 1346 'Flags not at the start of the expression %s (truncated)' % p[:20] 1347 ) 1348 1349 def test_dollar_matches_twice(self): 1350 "$ matches the end of string, and just before the terminating \n" 1351 pattern = re.compile('$') 1352 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 1353 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 1354 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1355 1356 pattern = re.compile('$', re.MULTILINE) 1357 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 1358 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 1359 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1360 1361 def test_bytes_str_mixing(self): 1362 # Mixing str and bytes is disallowed 1363 pat = re.compile('.') 1364 bpat = re.compile(b'.') 1365 self.assertRaises(TypeError, pat.match, b'b') 1366 self.assertRaises(TypeError, bpat.match, 'b') 1367 self.assertRaises(TypeError, pat.sub, b'b', 'c') 1368 self.assertRaises(TypeError, pat.sub, 'b', b'c') 1369 self.assertRaises(TypeError, pat.sub, b'b', b'c') 1370 self.assertRaises(TypeError, bpat.sub, b'b', 'c') 1371 self.assertRaises(TypeError, bpat.sub, 'b', b'c') 1372 self.assertRaises(TypeError, bpat.sub, 'b', 'c') 1373 1374 def test_ascii_and_unicode_flag(self): 1375 # String patterns 1376 for flags in (0, re.UNICODE): 1377 pat = re.compile('\xc0', flags | re.IGNORECASE) 1378 self.assertTrue(pat.match('\xe0')) 1379 pat = re.compile(r'\w', flags) 1380 self.assertTrue(pat.match('\xe0')) 1381 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE) 1382 self.assertIsNone(pat.match('\xe0')) 1383 pat = re.compile('(?a)\xc0', re.IGNORECASE) 1384 self.assertIsNone(pat.match('\xe0')) 1385 pat = 
re.compile(r'\w', re.ASCII) 1386 self.assertIsNone(pat.match('\xe0')) 1387 pat = re.compile(r'(?a)\w') 1388 self.assertIsNone(pat.match('\xe0')) 1389 # Bytes patterns 1390 for flags in (0, re.ASCII): 1391 pat = re.compile(b'\xc0', flags | re.IGNORECASE) 1392 self.assertIsNone(pat.match(b'\xe0')) 1393 pat = re.compile(br'\w', flags) 1394 self.assertIsNone(pat.match(b'\xe0')) 1395 # Incompatibilities 1396 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) 1397 self.assertRaises(ValueError, re.compile, br'(?u)\w') 1398 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII) 1399 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII) 1400 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) 1401 self.assertRaises(ValueError, re.compile, r'(?au)\w') 1402 1403 def test_locale_flag(self): 1404 import locale 1405 enc = locale.getpreferredencoding(False) 1406 # Search non-ASCII letter 1407 for i in range(128, 256): 1408 try: 1409 c = bytes([i]).decode(enc) 1410 sletter = c.lower() 1411 if sletter == c: continue 1412 bletter = sletter.encode(enc) 1413 if len(bletter) != 1: continue 1414 if bletter.decode(enc) != sletter: continue 1415 bpat = re.escape(bytes([i])) 1416 break 1417 except (UnicodeError, TypeError): 1418 pass 1419 else: 1420 bletter = None 1421 bpat = b'A' 1422 # Bytes patterns 1423 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE) 1424 if bletter: 1425 self.assertTrue(pat.match(bletter)) 1426 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE) 1427 if bletter: 1428 self.assertTrue(pat.match(bletter)) 1429 pat = re.compile(bpat, re.IGNORECASE) 1430 if bletter: 1431 self.assertIsNone(pat.match(bletter)) 1432 pat = re.compile(br'\w', re.LOCALE) 1433 if bletter: 1434 self.assertTrue(pat.match(bletter)) 1435 pat = re.compile(br'(?L)\w') 1436 if bletter: 1437 self.assertTrue(pat.match(bletter)) 1438 pat = re.compile(br'\w') 1439 if bletter: 1440 self.assertIsNone(pat.match(bletter)) 1441 # Incompatibilities 1442 
self.assertRaises(ValueError, re.compile, '', re.LOCALE) 1443 self.assertRaises(ValueError, re.compile, '(?L)') 1444 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII) 1445 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII) 1446 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) 1447 self.assertRaises(ValueError, re.compile, b'(?aL)') 1448 1449 def test_scoped_flags(self): 1450 self.assertTrue(re.match(r'(?i:a)b', 'Ab')) 1451 self.assertIsNone(re.match(r'(?i:a)b', 'aB')) 1452 self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE)) 1453 self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE)) 1454 self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) 1455 self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) 1456 1457 self.assertTrue(re.match(r'(?x: a) b', 'a b')) 1458 self.assertIsNone(re.match(r'(?x: a) b', ' a b')) 1459 self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) 1460 self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) 1461 1462 self.checkPatternError(r'(?a:\w)', 1463 'bad inline flags: cannot turn on global flag', 3) 1464 self.checkPatternError(r'(?a)(?-a:\w)', 1465 'bad inline flags: cannot turn off global flag', 8) 1466 self.checkPatternError(r'(?i-i:a)', 1467 'bad inline flags: flag turned on and off', 5) 1468 1469 self.checkPatternError(r'(?-', 'missing flag', 3) 1470 self.checkPatternError(r'(?-+', 'missing flag', 3) 1471 self.checkPatternError(r'(?-z', 'unknown flag', 3) 1472 self.checkPatternError(r'(?-i', 'missing :', 4) 1473 self.checkPatternError(r'(?-i)', 'missing :', 4) 1474 self.checkPatternError(r'(?-i+', 'missing :', 4) 1475 self.checkPatternError(r'(?-iz', 'unknown flag', 4) 1476 self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0) 1477 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 1478 self.checkPatternError(r'(?i+', 'missing -, : or )', 3) 1479 self.checkPatternError(r'(?iz', 'unknown flag', 3) 1480 1481 def test_bug_6509(self): 1482 # Replacement 
strings of both types must parse properly. 1483 # all strings 1484 pat = re.compile(r'a(\w)') 1485 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc') 1486 pat = re.compile('a(.)') 1487 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234') 1488 pat = re.compile('..') 1489 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str') 1490 1491 # all bytes 1492 pat = re.compile(br'a(\w)') 1493 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc') 1494 pat = re.compile(b'a(.)') 1495 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD') 1496 pat = re.compile(b'..') 1497 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') 1498 1499 def test_dealloc(self): 1500 # issue 3299: check for segfault in debug build 1501 import _sre 1502 # the overflow limit is different on wide and narrow builds and it 1503 # depends on the definition of SRE_CODE (see sre.h). 1504 # 2**128 should be big enough to overflow on both. For smaller values 1505 # a RuntimeError is raised instead of OverflowError. 1506 long_overflow = 2**128 1507 self.assertRaises(TypeError, re.finditer, "a", {}) 1508 with self.assertRaises(OverflowError): 1509 _sre.compile("abc", 0, [long_overflow], 0, [], []) 1510 with self.assertRaises(TypeError): 1511 _sre.compile({}, 0, [], 0, [], []) 1512 1513 def test_search_dot_unicode(self): 1514 self.assertTrue(re.search("123.*-", '123abc-')) 1515 self.assertTrue(re.search("123.*-", '123\xe9-')) 1516 self.assertTrue(re.search("123.*-", '123\u20ac-')) 1517 self.assertTrue(re.search("123.*-", '123\U0010ffff-')) 1518 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-')) 1519 1520 def test_compile(self): 1521 # Test return value when given string and pattern as parameter 1522 pattern = re.compile('random pattern') 1523 self.assertIsInstance(pattern, re._pattern_type) 1524 same_pattern = re.compile(pattern) 1525 self.assertIsInstance(same_pattern, re._pattern_type) 1526 self.assertIs(same_pattern, pattern) 1527 # Test behaviour when not given a string or pattern as 
parameter 1528 self.assertRaises(TypeError, re.compile, 0) 1529 1530 @bigmemtest(size=_2G, memuse=1) 1531 def test_large_search(self, size): 1532 # Issue #10182: indices were 32-bit-truncated. 1533 s = 'a' * size 1534 m = re.search('$', s) 1535 self.assertIsNotNone(m) 1536 self.assertEqual(m.start(), size) 1537 self.assertEqual(m.end(), size) 1538 1539 # The huge memuse is because of re.sub() using a list and a join() 1540 # to create the replacement result. 1541 @bigmemtest(size=_2G, memuse=16 + 2) 1542 def test_large_subn(self, size): 1543 # Issue #10182: indices were 32-bit-truncated. 1544 s = 'a' * size 1545 r, n = re.subn('', '', s) 1546 self.assertEqual(r, s) 1547 self.assertEqual(n, size + 1) 1548 1549 def test_bug_16688(self): 1550 # Issue 16688: Backreferences make case-insensitive regex fail on 1551 # non-ASCII strings. 1552 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a']) 1553 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2)) 1554 1555 def test_repeat_minmax_overflow(self): 1556 # Issue #13169 1557 string = "x" * 100000 1558 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535)) 1559 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535)) 1560 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535)) 1561 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536)) 1562 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536)) 1563 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536)) 1564 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t. 1565 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128) 1566 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128) 1567 self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% 2**128) 1568 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) 1569 1570 @cpython_only 1571 def test_repeat_minmax_overflow_maxrepeat(self): 1572 try: 1573 from _sre import MAXREPEAT 1574 except ImportError: 1575 self.skipTest('requires _sre.MAXREPEAT constant') 1576 string = "x" * 100000 1577 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) 1578 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), 1579 (0, 100000)) 1580 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) 1581 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) 1582 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) 1583 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT) 1584 1585 def test_backref_group_name_in_exception(self): 1586 # Issue 17341: Poor error message when compiling invalid regex 1587 self.checkPatternError('(?P=<foo>)', 1588 "bad character in group name '<foo>'", 4) 1589 1590 def test_group_name_in_exception(self): 1591 # Issue 17341: Poor error message when compiling invalid regex 1592 self.checkPatternError('(?P<?foo>)', 1593 "bad character in group name '?foo'", 4) 1594 1595 def test_issue17998(self): 1596 for reps in '*', '+', '?', '{1}': 1597 for mod in '', '?': 1598 pattern = '.' 
+ reps + mod + 'yz' 1599 self.assertEqual(re.compile(pattern, re.S).findall('xyz'), 1600 ['xyz'], msg=pattern) 1601 pattern = pattern.encode() 1602 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'), 1603 [b'xyz'], msg=pattern) 1604 1605 def test_match_repr(self): 1606 for string in '[abracadabra]', S('[abracadabra]'): 1607 m = re.search(r'(.+)(.*?)\1', string) 1608 self.assertEqual(repr(m), "<%s.%s object; " 1609 "span=(1, 12), match='abracadabra'>" % 1610 (type(m).__module__, type(m).__qualname__)) 1611 for string in (b'[abracadabra]', B(b'[abracadabra]'), 1612 bytearray(b'[abracadabra]'), 1613 memoryview(b'[abracadabra]')): 1614 m = re.search(br'(.+)(.*?)\1', string) 1615 self.assertEqual(repr(m), "<%s.%s object; " 1616 "span=(1, 12), match=b'abracadabra'>" % 1617 (type(m).__module__, type(m).__qualname__)) 1618 1619 first, second = list(re.finditer("(aa)|(bb)", "aa bb")) 1620 self.assertEqual(repr(first), "<%s.%s object; " 1621 "span=(0, 2), match='aa'>" % 1622 (type(second).__module__, type(first).__qualname__)) 1623 self.assertEqual(repr(second), "<%s.%s object; " 1624 "span=(3, 5), match='bb'>" % 1625 (type(second).__module__, type(second).__qualname__)) 1626 1627 1628 def test_bug_2537(self): 1629 # issue 2537: empty submatches 1630 for outer_op in ('{0,}', '*', '+', '{1,187}'): 1631 for inner_op in ('{0,}', '*', '?'): 1632 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op)) 1633 m = r.match("xyyzy") 1634 self.assertEqual(m.group(0), "xyy") 1635 self.assertEqual(m.group(1), "") 1636 self.assertEqual(m.group(2), "y") 1637 1638 def test_debug_flag(self): 1639 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 1640 with captured_stdout() as out: 1641 re.compile(pat, re.DEBUG) 1642 dump = '''\ 1643SUBPATTERN 1 0 0 1644 LITERAL 46 1645SUBPATTERN None 0 0 1646 BRANCH 1647 IN 1648 LITERAL 99 1649 LITERAL 104 1650 OR 1651 LITERAL 112 1652 LITERAL 121 1653SUBPATTERN None 0 0 1654 GROUPREF_EXISTS 1 1655 AT AT_END 1656 ELSE 1657 LITERAL 58 1658 LITERAL 32 1659''' 1660 
self.assertEqual(out.getvalue(), dump) 1661 # Debug output is output again even a second time (bypassing 1662 # the cache -- issue #20426). 1663 with captured_stdout() as out: 1664 re.compile(pat, re.DEBUG) 1665 self.assertEqual(out.getvalue(), dump) 1666 1667 def test_keyword_parameters(self): 1668 # Issue #20283: Accepting the string keyword parameter. 1669 pat = re.compile(r'(ab)') 1670 self.assertEqual( 1671 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9)) 1672 self.assertEqual( 1673 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9)) 1674 self.assertEqual( 1675 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9)) 1676 self.assertEqual( 1677 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab']) 1678 self.assertEqual( 1679 pat.split(string='abracadabra', maxsplit=1), 1680 ['', 'ab', 'racadabra']) 1681 self.assertEqual( 1682 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(), 1683 (7, 9)) 1684 1685 def test_bug_20998(self): 1686 # Issue #20998: Fullmatch of repeated single character pattern 1687 # with ignore case. 
1688 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3)) 1689 1690 def test_locale_caching(self): 1691 # Issue #22410 1692 oldlocale = locale.setlocale(locale.LC_CTYPE) 1693 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1694 for loc in 'en_US.iso88591', 'en_US.utf8': 1695 try: 1696 locale.setlocale(locale.LC_CTYPE, loc) 1697 except locale.Error: 1698 # Unsupported locale on this system 1699 self.skipTest('test needs %s locale' % loc) 1700 1701 re.purge() 1702 self.check_en_US_iso88591() 1703 self.check_en_US_utf8() 1704 re.purge() 1705 self.check_en_US_utf8() 1706 self.check_en_US_iso88591() 1707 1708 def check_en_US_iso88591(self): 1709 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1710 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1711 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1712 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1713 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1714 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5')) 1715 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5')) 1716 1717 def check_en_US_utf8(self): 1718 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1719 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1720 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1721 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1722 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1723 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) 1724 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) 1725 1726 def test_error(self): 1727 with self.assertRaises(re.error) as cm: 1728 re.compile('(\u20ac))') 1729 err = cm.exception 1730 self.assertIsInstance(err.pattern, str) 1731 self.assertEqual(err.pattern, '(\u20ac))') 1732 self.assertEqual(err.pos, 3) 1733 self.assertEqual(err.lineno, 1) 1734 self.assertEqual(err.colno, 4) 1735 self.assertIn(err.msg, str(err)) 1736 self.assertIn(' at position 3', str(err)) 1737 self.assertNotIn(' at position 
3', err.msg) 1738 # Bytes pattern 1739 with self.assertRaises(re.error) as cm: 1740 re.compile(b'(\xa4))') 1741 err = cm.exception 1742 self.assertIsInstance(err.pattern, bytes) 1743 self.assertEqual(err.pattern, b'(\xa4))') 1744 self.assertEqual(err.pos, 3) 1745 # Multiline pattern 1746 with self.assertRaises(re.error) as cm: 1747 re.compile(""" 1748 ( 1749 abc 1750 ) 1751 ) 1752 ( 1753 """, re.VERBOSE) 1754 err = cm.exception 1755 self.assertEqual(err.pos, 77) 1756 self.assertEqual(err.lineno, 5) 1757 self.assertEqual(err.colno, 17) 1758 self.assertIn(err.msg, str(err)) 1759 self.assertIn(' at position 77', str(err)) 1760 self.assertIn('(line 5, column 17)', str(err)) 1761 1762 def test_misc_errors(self): 1763 self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0) 1764 self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0) 1765 self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5) 1766 self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) 1767 self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) 1768 self.checkPatternError(r'(?iz)', 'unknown flag', 3) 1769 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 1770 self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) 1771 self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) 1772 self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) 1773 self.checkPatternError(r'(?', 'unexpected end of pattern', 2) 1774 1775 def test_enum(self): 1776 # Issue #28082: Check that str(flag) returns a human readable string 1777 # instead of an integer 1778 self.assertIn('ASCII', str(re.A)) 1779 self.assertIn('DOTALL', str(re.S)) 1780 1781 def test_pattern_compare(self): 1782 pattern1 = re.compile('abc', re.IGNORECASE) 1783 1784 # equal to itself 1785 self.assertEqual(pattern1, pattern1) 1786 self.assertFalse(pattern1 != pattern1) 1787 1788 # equal 1789 re.purge() 1790 pattern2 = re.compile('abc', re.IGNORECASE) 1791 
self.assertEqual(hash(pattern2), hash(pattern1)) 1792 self.assertEqual(pattern2, pattern1) 1793 1794 # not equal: different pattern 1795 re.purge() 1796 pattern3 = re.compile('XYZ', re.IGNORECASE) 1797 # Don't test hash(pattern3) != hash(pattern1) because there is no 1798 # warranty that hash values are different 1799 self.assertNotEqual(pattern3, pattern1) 1800 1801 # not equal: different flag (flags=0) 1802 re.purge() 1803 pattern4 = re.compile('abc') 1804 self.assertNotEqual(pattern4, pattern1) 1805 1806 # only == and != comparison operators are supported 1807 with self.assertRaises(TypeError): 1808 pattern1 < pattern2 1809 1810 def test_pattern_compare_bytes(self): 1811 pattern1 = re.compile(b'abc') 1812 1813 # equal: test bytes patterns 1814 re.purge() 1815 pattern2 = re.compile(b'abc') 1816 self.assertEqual(hash(pattern2), hash(pattern1)) 1817 self.assertEqual(pattern2, pattern1) 1818 1819 # not equal: pattern of a different types (str vs bytes), 1820 # comparison must not raise a BytesWarning 1821 re.purge() 1822 pattern3 = re.compile('abc') 1823 with warnings.catch_warnings(): 1824 warnings.simplefilter('error', BytesWarning) 1825 self.assertNotEqual(pattern3, pattern1) 1826 1827 def test_bug_29444(self): 1828 s = bytearray(b'abcdefgh') 1829 m = re.search(b'[a-h]+', s) 1830 m2 = re.search(b'[e-h]+', s) 1831 self.assertEqual(m.group(), b'abcdefgh') 1832 self.assertEqual(m2.group(), b'efgh') 1833 s[:] = b'xyz' 1834 self.assertEqual(m.group(), b'xyz') 1835 self.assertEqual(m2.group(), b'') 1836 1837 1838class PatternReprTests(unittest.TestCase): 1839 def check(self, pattern, expected): 1840 self.assertEqual(repr(re.compile(pattern)), expected) 1841 1842 def check_flags(self, pattern, flags, expected): 1843 self.assertEqual(repr(re.compile(pattern, flags)), expected) 1844 1845 def test_without_flags(self): 1846 self.check('random pattern', 1847 "re.compile('random pattern')") 1848 1849 def test_single_flag(self): 1850 self.check_flags('random pattern', 
re.IGNORECASE, 1851 "re.compile('random pattern', re.IGNORECASE)") 1852 1853 def test_multiple_flags(self): 1854 self.check_flags('random pattern', re.I|re.S|re.X, 1855 "re.compile('random pattern', " 1856 "re.IGNORECASE|re.DOTALL|re.VERBOSE)") 1857 1858 def test_unicode_flag(self): 1859 self.check_flags('random pattern', re.U, 1860 "re.compile('random pattern')") 1861 self.check_flags('random pattern', re.I|re.S|re.U, 1862 "re.compile('random pattern', " 1863 "re.IGNORECASE|re.DOTALL)") 1864 1865 def test_inline_flags(self): 1866 self.check('(?i)pattern', 1867 "re.compile('(?i)pattern', re.IGNORECASE)") 1868 1869 def test_unknown_flags(self): 1870 self.check_flags('random pattern', 0x123000, 1871 "re.compile('random pattern', 0x123000)") 1872 self.check_flags('random pattern', 0x123000|re.I, 1873 "re.compile('random pattern', re.IGNORECASE|0x123000)") 1874 1875 def test_bytes(self): 1876 self.check(b'bytes pattern', 1877 "re.compile(b'bytes pattern')") 1878 self.check_flags(b'bytes pattern', re.A, 1879 "re.compile(b'bytes pattern', re.ASCII)") 1880 1881 def test_locale(self): 1882 self.check_flags(b'bytes pattern', re.L, 1883 "re.compile(b'bytes pattern', re.LOCALE)") 1884 1885 def test_quotes(self): 1886 self.check('random "double quoted" pattern', 1887 '''re.compile('random "double quoted" pattern')''') 1888 self.check("random 'single quoted' pattern", 1889 '''re.compile("random 'single quoted' pattern")''') 1890 self.check('''both 'single' and "double" quotes''', 1891 '''re.compile('both \\'single\\' and "double" quotes')''') 1892 1893 def test_long_pattern(self): 1894 pattern = 'Very %spattern' % ('long ' * 1000) 1895 r = repr(re.compile(pattern)) 1896 self.assertLess(len(r), 300) 1897 self.assertEqual(r[:30], "re.compile('Very long long lon") 1898 r = repr(re.compile(pattern, re.I)) 1899 self.assertLess(len(r), 300) 1900 self.assertEqual(r[:30], "re.compile('Very long long lon") 1901 self.assertEqual(r[-16:], ", re.IGNORECASE)") 1902 1903 1904class 
ImplementationTest(unittest.TestCase): 1905 """ 1906 Test implementation details of the re module. 1907 """ 1908 1909 def test_overlap_table(self): 1910 f = sre_compile._generate_overlap_table 1911 self.assertEqual(f(""), []) 1912 self.assertEqual(f("a"), [0]) 1913 self.assertEqual(f("abcd"), [0, 0, 0, 0]) 1914 self.assertEqual(f("aaaa"), [0, 1, 2, 3]) 1915 self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1]) 1916 self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) 1917 1918 1919class ExternalTests(unittest.TestCase): 1920 1921 def test_re_benchmarks(self): 1922 're_tests benchmarks' 1923 from test.re_tests import benchmarks 1924 for pattern, s in benchmarks: 1925 with self.subTest(pattern=pattern, string=s): 1926 p = re.compile(pattern) 1927 self.assertTrue(p.search(s)) 1928 self.assertTrue(p.match(s)) 1929 self.assertTrue(p.fullmatch(s)) 1930 s2 = ' '*10000 + s + ' '*10000 1931 self.assertTrue(p.search(s2)) 1932 self.assertTrue(p.match(s2, 10000)) 1933 self.assertTrue(p.match(s2, 10000, 10000 + len(s))) 1934 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s))) 1935 1936 def test_re_tests(self): 1937 're_tests test suite' 1938 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR 1939 for t in tests: 1940 pattern = s = outcome = repl = expected = None 1941 if len(t) == 5: 1942 pattern, s, outcome, repl, expected = t 1943 elif len(t) == 3: 1944 pattern, s, outcome = t 1945 else: 1946 raise ValueError('Test tuples should have 3 or 5 fields', t) 1947 1948 with self.subTest(pattern=pattern, string=s): 1949 if outcome == SYNTAX_ERROR: # Expected a syntax error 1950 with self.assertRaises(re.error): 1951 re.compile(pattern) 1952 continue 1953 1954 obj = re.compile(pattern) 1955 result = obj.search(s) 1956 if outcome == FAIL: 1957 self.assertIsNone(result, 'Succeeded incorrectly') 1958 continue 1959 1960 with self.subTest(): 1961 self.assertTrue(result, 'Failed incorrectly') 1962 # Matched, as expected, so now we compute the 1963 # result string and compare 
it to our expected result. 1964 start, end = result.span(0) 1965 vardict = {'found': result.group(0), 1966 'groups': result.group(), 1967 'flags': result.re.flags} 1968 for i in range(1, 100): 1969 try: 1970 gi = result.group(i) 1971 # Special hack because else the string concat fails: 1972 if gi is None: 1973 gi = "None" 1974 except IndexError: 1975 gi = "Error" 1976 vardict['g%d' % i] = gi 1977 for i in result.re.groupindex.keys(): 1978 try: 1979 gi = result.group(i) 1980 if gi is None: 1981 gi = "None" 1982 except IndexError: 1983 gi = "Error" 1984 vardict[i] = gi 1985 self.assertEqual(eval(repl, vardict), expected, 1986 'grouping error') 1987 1988 # Try the match with both pattern and string converted to 1989 # bytes, and check that it still succeeds. 1990 try: 1991 bpat = bytes(pattern, "ascii") 1992 bs = bytes(s, "ascii") 1993 except UnicodeEncodeError: 1994 # skip non-ascii tests 1995 pass 1996 else: 1997 with self.subTest('bytes pattern match'): 1998 obj = re.compile(bpat) 1999 self.assertTrue(obj.search(bs)) 2000 2001 # Try the match with LOCALE enabled, and check that it 2002 # still succeeds. 2003 with self.subTest('locale-sensitive match'): 2004 obj = re.compile(bpat, re.LOCALE) 2005 result = obj.search(bs) 2006 if result is None: 2007 print('=== Fails on locale-sensitive match', t) 2008 2009 # Try the match with the search area limited to the extent 2010 # of the match and see if it still succeeds. \B will 2011 # break (because it won't match at the end or start of a 2012 # string), so we'll ignore patterns that feature it. 2013 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B' 2014 and result is not None): 2015 with self.subTest('range-limited match'): 2016 obj = re.compile(pattern) 2017 self.assertTrue(obj.search(s, start, end + 1)) 2018 2019 # Try the match with IGNORECASE enabled, and check that it 2020 # still succeeds. 
2021 with self.subTest('case-insensitive match'): 2022 obj = re.compile(pattern, re.IGNORECASE) 2023 self.assertTrue(obj.search(s)) 2024 2025 # Try the match with UNICODE locale enabled, and check 2026 # that it still succeeds. 2027 with self.subTest('unicode-sensitive match'): 2028 obj = re.compile(pattern, re.UNICODE) 2029 self.assertTrue(obj.search(s)) 2030 2031 2032if __name__ == "__main__": 2033 unittest.main() 2034