""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
8import _string
9import codecs
10import itertools
11import operator
12import struct
13import string
14import sys
15import unittest
16import warnings
17from test import support, string_tests
18
19# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that hands out deliberately malformed codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a bare
    integer instead of a tuple; "test.unicode2" maps to a pair that
    returns a tuple whose first item is not a string.  Any other
    encoding name yields None.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


# Make the broken codecs reachable through the normal codec lookup.
codecs.register(search_function)
36
def duplicate_string(text):
    """
    Return a copy of the given text produced by an encode/decode
    round trip, aiming for a new object with a reference count of 1.

    Best-effort only: the empty string ('') and latin1 single
    letters are singletons, so they cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
46
class StrSubclass(str):
    """Plain ``str`` subclass that adds no attributes or methods."""
49
50class UnicodeTest(string_tests.CommonTest,
51        string_tests.MixinStrUnicodeUserStringTest,
52        string_tests.MixinStrUnicodeTest,
53        unittest.TestCase):
54
55    type2test = str
56
57    def checkequalnofix(self, result, object, methodname, *args):
58        method = getattr(object, methodname)
59        realresult = method(*args)
60        self.assertEqual(realresult, result)
61        self.assertTrue(type(realresult) is type(result))
62
63        # if the original is returned make sure that
64        # this doesn't happen with subclasses
65        if realresult is object:
66            class usub(str):
67                def __repr__(self):
68                    return 'usub(%r)' % str.__repr__(self)
69            object = usub(object)
70            method = getattr(object, methodname)
71            realresult = method(*args)
72            self.assertEqual(realresult, result)
73            self.assertTrue(object is not realresult)
74
75    def test_literals(self):
76        self.assertEqual('\xff', '\u00ff')
77        self.assertEqual('\uffff', '\U0000ffff')
78        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
79        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
80        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
81        # raw strings should not have unicode escapes
82        self.assertNotEqual(r"\u0020", " ")
83
84    def test_ascii(self):
85        if not sys.platform.startswith('java'):
86            # Test basic sanity of repr()
87            self.assertEqual(ascii('abc'), "'abc'")
88            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
89            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
90            self.assertEqual(ascii('\\c'), "'\\\\c'")
91            self.assertEqual(ascii('\\'), "'\\\\'")
92            self.assertEqual(ascii('\n'), "'\\n'")
93            self.assertEqual(ascii('\r'), "'\\r'")
94            self.assertEqual(ascii('\t'), "'\\t'")
95            self.assertEqual(ascii('\b'), "'\\x08'")
96            self.assertEqual(ascii("'\""), """'\\'"'""")
97            self.assertEqual(ascii("'\""), """'\\'"'""")
98            self.assertEqual(ascii("'"), '''"'"''')
99            self.assertEqual(ascii('"'), """'"'""")
100            latin1repr = (
101                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
102                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
103                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
104                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
105                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
106                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
107                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
108                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
109                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
110                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
111                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
112                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
113                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
114                "\\xfe\\xff'")
115            testrepr = ascii(''.join(map(chr, range(256))))
116            self.assertEqual(testrepr, latin1repr)
117            # Test ascii works on wide unicode escapes without overflow.
118            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
119                             ascii("\U00010000" * 39 + "\uffff" * 4096))
120
121            class WrongRepr:
122                def __repr__(self):
123                    return b'byte-repr'
124            self.assertRaises(TypeError, ascii, WrongRepr())
125
126    def test_repr(self):
127        if not sys.platform.startswith('java'):
128            # Test basic sanity of repr()
129            self.assertEqual(repr('abc'), "'abc'")
130            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
131            self.assertEqual(repr('ab\\'), "'ab\\\\'")
132            self.assertEqual(repr('\\c'), "'\\\\c'")
133            self.assertEqual(repr('\\'), "'\\\\'")
134            self.assertEqual(repr('\n'), "'\\n'")
135            self.assertEqual(repr('\r'), "'\\r'")
136            self.assertEqual(repr('\t'), "'\\t'")
137            self.assertEqual(repr('\b'), "'\\x08'")
138            self.assertEqual(repr("'\""), """'\\'"'""")
139            self.assertEqual(repr("'\""), """'\\'"'""")
140            self.assertEqual(repr("'"), '''"'"''')
141            self.assertEqual(repr('"'), """'"'""")
142            latin1repr = (
143                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
144                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
145                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
146                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
147                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
148                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
149                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
150                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
151                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
152                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
153                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
154                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
155                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
156                "\xfe\xff'")
157            testrepr = repr(''.join(map(chr, range(256))))
158            self.assertEqual(testrepr, latin1repr)
159            # Test repr works on wide unicode escapes without overflow.
160            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
161                             repr("\U00010000" * 39 + "\uffff" * 4096))
162
163            class WrongRepr:
164                def __repr__(self):
165                    return b'byte-repr'
166            self.assertRaises(TypeError, repr, WrongRepr())
167
168    def test_iterators(self):
169        # Make sure unicode objects have an __iter__ method
170        it = "\u1111\u2222\u3333".__iter__()
171        self.assertEqual(next(it), "\u1111")
172        self.assertEqual(next(it), "\u2222")
173        self.assertEqual(next(it), "\u3333")
174        self.assertRaises(StopIteration, next, it)
175
176    def test_count(self):
177        string_tests.CommonTest.test_count(self)
178        # check mixed argument types
179        self.checkequalnofix(3,  'aaa', 'count', 'a')
180        self.checkequalnofix(0,  'aaa', 'count', 'b')
181        self.checkequalnofix(3, 'aaa', 'count',  'a')
182        self.checkequalnofix(0, 'aaa', 'count',  'b')
183        self.checkequalnofix(0, 'aaa', 'count',  'b')
184        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
185        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
186        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
187        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
188        # test mixed kinds
189        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
190        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
191        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
192        self.checkequal(0, 'a' * 10, 'count', '\u0102')
193        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
194        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
195        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
196        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
197        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
198        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
199        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
200        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
201
202    def test_find(self):
203        string_tests.CommonTest.test_find(self)
204        # test implementation details of the memchr fast path
205        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
206        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
207        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
208        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
209        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
210        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
211        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
212        # check mixed argument types
213        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
214        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
215        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
216
217        self.assertRaises(TypeError, 'hello'.find)
218        self.assertRaises(TypeError, 'hello'.find, 42)
219        # test mixed kinds
220        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
221        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
222        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
223        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
224        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
225        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
226        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
227        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
228        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
229        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
230        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
231        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
232
233    def test_rfind(self):
234        string_tests.CommonTest.test_rfind(self)
235        # test implementation details of the memrchr fast path
236        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
237        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
238        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
239        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
240        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
241        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
242        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
243        # check mixed argument types
244        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
245        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
246        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
247        # test mixed kinds
248        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
249        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
250        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
251        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
252        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
253        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
254        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
255        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
256        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
257        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
258        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
259        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
260
261    def test_index(self):
262        string_tests.CommonTest.test_index(self)
263        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
264        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
265        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
266        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
267        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
268        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
269        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
270        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
271        # test mixed kinds
272        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
273        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
274        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
275        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
276        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
277        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
278        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
279        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
280        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
281        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
282        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
283        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
284
285    def test_rindex(self):
286        string_tests.CommonTest.test_rindex(self)
287        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
288        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
289        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
290        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
291
292        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
293        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
294        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
295        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
296        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
297        # test mixed kinds
298        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
299        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
300        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
301        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
302        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
303        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
304        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
305        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
306        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
307        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
308        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
309        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
310
311    def test_maketrans_translate(self):
312        # these work with plain translate()
313        self.checkequalnofix('bbbc', 'abababc', 'translate',
314                             {ord('a'): None})
315        self.checkequalnofix('iiic', 'abababc', 'translate',
316                             {ord('a'): None, ord('b'): ord('i')})
317        self.checkequalnofix('iiix', 'abababc', 'translate',
318                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
319        self.checkequalnofix('c', 'abababc', 'translate',
320                             {ord('a'): None, ord('b'): ''})
321        self.checkequalnofix('xyyx', 'xzx', 'translate',
322                             {ord('z'): 'yy'})
323
324        # this needs maketrans()
325        self.checkequalnofix('abababc', 'abababc', 'translate',
326                             {'b': '<i>'})
327        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
328        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
329        # test alternative way of calling maketrans()
330        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
331        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
332
333        # various tests switching from ASCII to latin1 or the opposite;
334        # same length, remove a letter, or replace with a longer string.
335        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
336                         "[X]")
337        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
338                         "[X]")
339        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
340                         "[]")
341        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
342                         "[XXX]")
343        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
344                         "[\xe9]")
345        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
346                         "x123")
347        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
348                         "x\xe9")
349
350        # test non-ASCII (don't take the fast-path)
351        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
352                         "[<\xe9>]")
353        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
354                         "[a]")
355        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
356                         "[]")
357        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
358                         "[123]")
359        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
360                         "[<\u20ac>\xe9]")
361
362        # invalid Unicode characters
363        invalid_char = 0x10ffff+1
364        for before in "a\xe9\u20ac\U0010ffff":
365            mapping = str.maketrans({before: invalid_char})
366            text = "[%s]" % before
367            self.assertRaises(ValueError, text.translate, mapping)
368
369        # errors
370        self.assertRaises(TypeError, self.type2test.maketrans)
371        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
372        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
373        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
374        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
375        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
376        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
377
378        self.assertRaises(TypeError, 'hello'.translate)
379        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
380
381    def test_split(self):
382        string_tests.CommonTest.test_split(self)
383
384        # test mixed kinds
385        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
386            left *= 9
387            right *= 9
388            for delim in ('c', '\u0102', '\U00010302'):
389                self.checkequal([left + right],
390                                left + right, 'split', delim)
391                self.checkequal([left, right],
392                                left + delim + right, 'split', delim)
393                self.checkequal([left + right],
394                                left + right, 'split', delim * 2)
395                self.checkequal([left, right],
396                                left + delim * 2 + right, 'split', delim *2)
397
398    def test_rsplit(self):
399        string_tests.CommonTest.test_rsplit(self)
400        # test mixed kinds
401        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
402            left *= 9
403            right *= 9
404            for delim in ('c', '\u0102', '\U00010302'):
405                self.checkequal([left + right],
406                                left + right, 'rsplit', delim)
407                self.checkequal([left, right],
408                                left + delim + right, 'rsplit', delim)
409                self.checkequal([left + right],
410                                left + right, 'rsplit', delim * 2)
411                self.checkequal([left, right],
412                                left + delim * 2 + right, 'rsplit', delim *2)
413
414    def test_partition(self):
415        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
416        # test mixed kinds
417        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
418        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
419            left *= 9
420            right *= 9
421            for delim in ('c', '\u0102', '\U00010302'):
422                self.checkequal((left + right, '', ''),
423                                left + right, 'partition', delim)
424                self.checkequal((left, delim, right),
425                                left + delim + right, 'partition', delim)
426                self.checkequal((left + right, '', ''),
427                                left + right, 'partition', delim * 2)
428                self.checkequal((left, delim * 2, right),
429                                left + delim * 2 + right, 'partition', delim * 2)
430
431    def test_rpartition(self):
432        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
433        # test mixed kinds
434        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
435        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
436            left *= 9
437            right *= 9
438            for delim in ('c', '\u0102', '\U00010302'):
439                self.checkequal(('', '', left + right),
440                                left + right, 'rpartition', delim)
441                self.checkequal((left, delim, right),
442                                left + delim + right, 'rpartition', delim)
443                self.checkequal(('', '', left + right),
444                                left + right, 'rpartition', delim * 2)
445                self.checkequal((left, delim * 2, right),
446                                left + delim * 2 + right, 'rpartition', delim * 2)
447
448    def test_join(self):
449        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
450
451        class MyWrapper:
452            def __init__(self, sval): self.sval = sval
453            def __str__(self): return self.sval
454
455        # mixed arguments
456        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
457        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
458        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
459        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
460        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
461        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
462        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
463        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
464        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
465        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
466        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
467
468    @unittest.skipIf(sys.maxsize > 2**32,
469        'needs too much memory on a 64-bit platform')
470    def test_join_overflow(self):
471        size = int(sys.maxsize**0.5) + 1
472        seq = ('A' * size,) * size
473        self.assertRaises(OverflowError, ''.join, seq)
474
475    def test_replace(self):
476        string_tests.CommonTest.test_replace(self)
477
478        # method call forwarded from str implementation because of unicode argument
479        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
480        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
481        # test mixed kinds
482        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
483            left *= 9
484            right *= 9
485            for delim in ('c', '\u0102', '\U00010302'):
486                for repl in ('d', '\u0103', '\U00010303'):
487                    self.checkequal(left + right,
488                                    left + right, 'replace', delim, repl)
489                    self.checkequal(left + repl + right,
490                                    left + delim + right,
491                                    'replace', delim, repl)
492                    self.checkequal(left + right,
493                                    left + right, 'replace', delim * 2, repl)
494                    self.checkequal(left + repl + right,
495                                    left + delim * 2 + right,
496                                    'replace', delim * 2, repl)
497
498    @support.cpython_only
499    def test_replace_id(self):
500        pattern = 'abc'
501        text = 'abc def'
502        self.assertIs(text.replace(pattern, pattern), text)
503
504    def test_bytes_comparison(self):
505        with support.check_warnings():
506            warnings.simplefilter('ignore', BytesWarning)
507            self.assertEqual('abc' == b'abc', False)
508            self.assertEqual('abc' != b'abc', True)
509            self.assertEqual('abc' == bytearray(b'abc'), False)
510            self.assertEqual('abc' != bytearray(b'abc'), True)
511
512    def test_comparison(self):
513        # Comparisons:
514        self.assertEqual('abc', 'abc')
515        self.assertTrue('abcd' > 'abc')
516        self.assertTrue('abc' < 'abcd')
517
518        if 0:
519            # Move these tests to a Unicode collation module test...
520            # Testing UTF-16 code point order comparisons...
521
522            # No surrogates, no fixup required.
523            self.assertTrue('\u0061' < '\u20ac')
524            # Non surrogate below surrogate value, no fixup required
525            self.assertTrue('\u0061' < '\ud800\udc02')
526
527            # Non surrogate above surrogate value, fixup required
528            def test_lecmp(s, s2):
529                self.assertTrue(s < s2)
530
531            def test_fixup(s):
532                s2 = '\ud800\udc01'
533                test_lecmp(s, s2)
534                s2 = '\ud900\udc01'
535                test_lecmp(s, s2)
536                s2 = '\uda00\udc01'
537                test_lecmp(s, s2)
538                s2 = '\udb00\udc01'
539                test_lecmp(s, s2)
540                s2 = '\ud800\udd01'
541                test_lecmp(s, s2)
542                s2 = '\ud900\udd01'
543                test_lecmp(s, s2)
544                s2 = '\uda00\udd01'
545                test_lecmp(s, s2)
546                s2 = '\udb00\udd01'
547                test_lecmp(s, s2)
548                s2 = '\ud800\ude01'
549                test_lecmp(s, s2)
550                s2 = '\ud900\ude01'
551                test_lecmp(s, s2)
552                s2 = '\uda00\ude01'
553                test_lecmp(s, s2)
554                s2 = '\udb00\ude01'
555                test_lecmp(s, s2)
556                s2 = '\ud800\udfff'
557                test_lecmp(s, s2)
558                s2 = '\ud900\udfff'
559                test_lecmp(s, s2)
560                s2 = '\uda00\udfff'
561                test_lecmp(s, s2)
562                s2 = '\udb00\udfff'
563                test_lecmp(s, s2)
564
565                test_fixup('\ue000')
566                test_fixup('\uff61')
567
568        # Surrogates on both sides, no fixup required
569        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
570
571    def test_islower(self):
572        super().test_islower()
573        self.checkequalnofix(False, '\u1FFc', 'islower')
574        self.assertFalse('\u2167'.islower())
575        self.assertTrue('\u2177'.islower())
576        # non-BMP, uppercase
577        self.assertFalse('\U00010401'.islower())
578        self.assertFalse('\U00010427'.islower())
579        # non-BMP, lowercase
580        self.assertTrue('\U00010429'.islower())
581        self.assertTrue('\U0001044E'.islower())
582        # non-BMP, non-cased
583        self.assertFalse('\U0001F40D'.islower())
584        self.assertFalse('\U0001F46F'.islower())
585
586    def test_isupper(self):
587        super().test_isupper()
588        if not sys.platform.startswith('java'):
589            self.checkequalnofix(False, '\u1FFc', 'isupper')
590        self.assertTrue('\u2167'.isupper())
591        self.assertFalse('\u2177'.isupper())
592        # non-BMP, uppercase
593        self.assertTrue('\U00010401'.isupper())
594        self.assertTrue('\U00010427'.isupper())
595        # non-BMP, lowercase
596        self.assertFalse('\U00010429'.isupper())
597        self.assertFalse('\U0001044E'.isupper())
598        # non-BMP, non-cased
599        self.assertFalse('\U0001F40D'.isupper())
600        self.assertFalse('\U0001F46F'.isupper())
601
602    def test_istitle(self):
603        super().test_istitle()
604        self.checkequalnofix(True, '\u1FFc', 'istitle')
605        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
606
607        # non-BMP, uppercase + lowercase
608        self.assertTrue('\U00010401\U00010429'.istitle())
609        self.assertTrue('\U00010427\U0001044E'.istitle())
610        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
611        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
612            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
613
614    def test_isspace(self):
615        super().test_isspace()
616        self.checkequalnofix(True, '\u2000', 'isspace')
617        self.checkequalnofix(True, '\u200a', 'isspace')
618        self.checkequalnofix(False, '\u2014', 'isspace')
619        # apparently there are no non-BMP spaces chars in Unicode 6
620        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
621                   '\U0001F40D', '\U0001F46F']:
622            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
623
624    def test_isalnum(self):
625        super().test_isalnum()
626        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
627                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
628            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
629
630    def test_isalpha(self):
631        super().test_isalpha()
632        self.checkequalnofix(True, '\u1FFc', 'isalpha')
633        # non-BMP, cased
634        self.assertTrue('\U00010401'.isalpha())
635        self.assertTrue('\U00010427'.isalpha())
636        self.assertTrue('\U00010429'.isalpha())
637        self.assertTrue('\U0001044E'.isalpha())
638        # non-BMP, non-cased
639        self.assertFalse('\U0001F40D'.isalpha())
640        self.assertFalse('\U0001F46F'.isalpha())
641
642    def test_isdecimal(self):
643        self.checkequalnofix(False, '', 'isdecimal')
644        self.checkequalnofix(False, 'a', 'isdecimal')
645        self.checkequalnofix(True, '0', 'isdecimal')
646        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
647        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
648        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
649        self.checkequalnofix(True, '0123456789', 'isdecimal')
650        self.checkequalnofix(False, '0123456789a', 'isdecimal')
651
652        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
653
654        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
655                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
656            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
657        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
658            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
659
660    def test_isdigit(self):
661        super().test_isdigit()
662        self.checkequalnofix(True, '\u2460', 'isdigit')
663        self.checkequalnofix(False, '\xbc', 'isdigit')
664        self.checkequalnofix(True, '\u0660', 'isdigit')
665
666        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
667                   '\U0001F40D', '\U0001F46F', '\U00011065']:
668            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
669        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
670            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
671
672    def test_isnumeric(self):
673        self.checkequalnofix(False, '', 'isnumeric')
674        self.checkequalnofix(False, 'a', 'isnumeric')
675        self.checkequalnofix(True, '0', 'isnumeric')
676        self.checkequalnofix(True, '\u2460', 'isnumeric')
677        self.checkequalnofix(True, '\xbc', 'isnumeric')
678        self.checkequalnofix(True, '\u0660', 'isnumeric')
679        self.checkequalnofix(True, '0123456789', 'isnumeric')
680        self.checkequalnofix(False, '0123456789a', 'isnumeric')
681
682        self.assertRaises(TypeError, "abc".isnumeric, 42)
683
684        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
685                   '\U0001F40D', '\U0001F46F']:
686            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
687        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
688                   '\U000104A0', '\U0001F107']:
689            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
690
691    def test_isidentifier(self):
692        self.assertTrue("a".isidentifier())
693        self.assertTrue("Z".isidentifier())
694        self.assertTrue("_".isidentifier())
695        self.assertTrue("b0".isidentifier())
696        self.assertTrue("bc".isidentifier())
697        self.assertTrue("b_".isidentifier())
698        self.assertTrue("µ".isidentifier())
699        self.assertTrue("��������������".isidentifier())
700
701        self.assertFalse(" ".isidentifier())
702        self.assertFalse("[".isidentifier())
703        self.assertFalse("©".isidentifier())
704        self.assertFalse("0".isidentifier())
705
706    def test_isprintable(self):
707        self.assertTrue("".isprintable())
708        self.assertTrue(" ".isprintable())
709        self.assertTrue("abcdefg".isprintable())
710        self.assertFalse("abcdefg\n".isprintable())
711        # some defined Unicode character
712        self.assertTrue("\u0374".isprintable())
713        # undefined character
714        self.assertFalse("\u0378".isprintable())
715        # single surrogate character
716        self.assertFalse("\ud800".isprintable())
717
718        self.assertTrue('\U0001F46F'.isprintable())
719        self.assertFalse('\U000E0020'.isprintable())
720
721    def test_surrogates(self):
722        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
723                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
724            self.assertTrue(s.islower())
725            self.assertFalse(s.isupper())
726            self.assertFalse(s.istitle())
727        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
728                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
729            self.assertFalse(s.islower())
730            self.assertTrue(s.isupper())
731            self.assertTrue(s.istitle())
732
733        for meth_name in ('islower', 'isupper', 'istitle'):
734            meth = getattr(str, meth_name)
735            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
736                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
737
738        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
739                          'isdecimal', 'isnumeric',
740                          'isidentifier', 'isprintable'):
741            meth = getattr(str, meth_name)
742            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
743                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
744                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
745                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
746
747
748    def test_lower(self):
749        string_tests.CommonTest.test_lower(self)
750        self.assertEqual('\U00010427'.lower(), '\U0001044F')
751        self.assertEqual('\U00010427\U00010427'.lower(),
752                         '\U0001044F\U0001044F')
753        self.assertEqual('\U00010427\U0001044F'.lower(),
754                         '\U0001044F\U0001044F')
755        self.assertEqual('X\U00010427x\U0001044F'.lower(),
756                         'x\U0001044Fx\U0001044F')
757        self.assertEqual('fi'.lower(), 'fi')
758        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
759        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
760        self.assertEqual('\u03a3'.lower(), '\u03c3')
761        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
762        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
763        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
764        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
765        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
766        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
767        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
768        self.assertEqual('\u2177'.lower(), '\u2177')
769
770    def test_casefold(self):
771        self.assertEqual('hello'.casefold(), 'hello')
772        self.assertEqual('hELlo'.casefold(), 'hello')
773        self.assertEqual('ß'.casefold(), 'ss')
774        self.assertEqual('fi'.casefold(), 'fi')
775        self.assertEqual('\u03a3'.casefold(), '\u03c3')
776        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
777        self.assertEqual('\u00b5'.casefold(), '\u03bc')
778
779    def test_upper(self):
780        string_tests.CommonTest.test_upper(self)
781        self.assertEqual('\U0001044F'.upper(), '\U00010427')
782        self.assertEqual('\U0001044F\U0001044F'.upper(),
783                         '\U00010427\U00010427')
784        self.assertEqual('\U00010427\U0001044F'.upper(),
785                         '\U00010427\U00010427')
786        self.assertEqual('X\U00010427x\U0001044F'.upper(),
787                         'X\U00010427X\U00010427')
788        self.assertEqual('fi'.upper(), 'FI')
789        self.assertEqual('\u0130'.upper(), '\u0130')
790        self.assertEqual('\u03a3'.upper(), '\u03a3')
791        self.assertEqual('ß'.upper(), 'SS')
792        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
793        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
794        self.assertEqual('\u2177'.upper(), '\u2167')
795
796    def test_capitalize(self):
797        string_tests.CommonTest.test_capitalize(self)
798        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
799        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
800                         '\U00010427\U0001044F')
801        self.assertEqual('\U00010427\U0001044F'.capitalize(),
802                         '\U00010427\U0001044F')
803        self.assertEqual('\U0001044F\U00010427'.capitalize(),
804                         '\U00010427\U0001044F')
805        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
806                         'X\U0001044Fx\U0001044F')
807        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
808        exp = '\u0399\u0308\u0300\u0069\u0307'
809        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
810        self.assertEqual('finnish'.capitalize(), 'FInnish')
811        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
812
813    def test_title(self):
814        super().test_title()
815        self.assertEqual('\U0001044F'.title(), '\U00010427')
816        self.assertEqual('\U0001044F\U0001044F'.title(),
817                         '\U00010427\U0001044F')
818        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
819                         '\U00010427\U0001044F \U00010427\U0001044F')
820        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
821                         '\U00010427\U0001044F \U00010427\U0001044F')
822        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
823                         '\U00010427\U0001044F \U00010427\U0001044F')
824        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
825                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
826        self.assertEqual('fiNNISH'.title(), 'Finnish')
827        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
828        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
829
830    def test_swapcase(self):
831        string_tests.CommonTest.test_swapcase(self)
832        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
833        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
834        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
835                         '\U00010427\U00010427')
836        self.assertEqual('\U00010427\U0001044F'.swapcase(),
837                         '\U0001044F\U00010427')
838        self.assertEqual('\U0001044F\U00010427'.swapcase(),
839                         '\U00010427\U0001044F')
840        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
841                         'x\U0001044FX\U00010427')
842        self.assertEqual('fi'.swapcase(), 'FI')
843        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
844        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
845        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
846        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
847        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
848        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
849        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
850        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
851        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
852        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
853        self.assertEqual('ß'.swapcase(), 'SS')
854        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
855
856    def test_center(self):
857        string_tests.CommonTest.test_center(self)
858        self.assertEqual('x'.center(2, '\U0010FFFF'),
859                         'x\U0010FFFF')
860        self.assertEqual('x'.center(3, '\U0010FFFF'),
861                         '\U0010FFFFx\U0010FFFF')
862        self.assertEqual('x'.center(4, '\U0010FFFF'),
863                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
864
865    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
866    @support.cpython_only
867    def test_case_operation_overflow(self):
868        # Issue #22643
869        size = 2**32//12 + 1
870        try:
871            s = "ü" * size
872        except MemoryError:
873            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
874        try:
875            self.assertRaises(OverflowError, s.upper)
876        finally:
877            del s
878
879    def test_contains(self):
880        # Testing Unicode contains method
881        self.assertIn('a', 'abdb')
882        self.assertIn('a', 'bdab')
883        self.assertIn('a', 'bdaba')
884        self.assertIn('a', 'bdba')
885        self.assertNotIn('a', 'bdb')
886        self.assertIn('a', 'bdba')
887        self.assertIn('a', ('a',1,None))
888        self.assertIn('a', (1,None,'a'))
889        self.assertIn('a', ('a',1,None))
890        self.assertIn('a', (1,None,'a'))
891        self.assertNotIn('a', ('x',1,'y'))
892        self.assertNotIn('a', ('x',1,None))
893        self.assertNotIn('abcd', 'abcxxxx')
894        self.assertIn('ab', 'abcd')
895        self.assertIn('ab', 'abc')
896        self.assertIn('ab', (1,None,'ab'))
897        self.assertIn('', 'abc')
898        self.assertIn('', '')
899        self.assertIn('', 'abc')
900        self.assertNotIn('\0', 'abc')
901        self.assertIn('\0', '\0abc')
902        self.assertIn('\0', 'abc\0')
903        self.assertIn('a', '\0abc')
904        self.assertIn('asdf', 'asdf')
905        self.assertNotIn('asdf', 'asd')
906        self.assertNotIn('asdf', '')
907
908        self.assertRaises(TypeError, "abc".__contains__)
909        # test mixed kinds
910        for fill in ('a', '\u0100', '\U00010300'):
911            fill *= 9
912            for delim in ('c', '\u0102', '\U00010302'):
913                self.assertNotIn(delim, fill)
914                self.assertIn(delim, fill + delim)
915                self.assertNotIn(delim * 2, fill)
916                self.assertIn(delim * 2, fill + delim * 2)
917
918    def test_issue18183(self):
919        '\U00010000\U00100000'.lower()
920        '\U00010000\U00100000'.casefold()
921        '\U00010000\U00100000'.upper()
922        '\U00010000\U00100000'.capitalize()
923        '\U00010000\U00100000'.title()
924        '\U00010000\U00100000'.swapcase()
925        '\U00100000'.center(3, '\U00010000')
926        '\U00100000'.ljust(3, '\U00010000')
927        '\U00100000'.rjust(3, '\U00010000')
928
    def test_format(self):
        # Exercise str.format(): literal text and brace escaping, positional,
        # attribute and index field lookup, format specs for strings and
        # numbers, the !r/!s/!a conversions, objects with custom __format__,
        # and malformed format strings.

        # literal text and '{{' / '}}' escapes, no replacement fields
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing
        # C: __format__ returns the format spec itself; the constructor
        # argument is stored as the attribute _x.
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # D: __format__ ignores the spec and returns str(self.x).
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

        # date subclass whose __format__ passes the spec to strftime()
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # int subclass whose __format__ formats twice the value
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

        # class with __repr__ whose __str__ is explicitly set to None
        class M:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'M(' + self.x + ')'
            __str__ = None

        # class with __repr__ whose __format__ is explicitly set to None
        class N:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'N(' + self.x + ')'
            __format__ = None

        # positional fields, surrounding literal text and brace escapes
        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
        self.assertEqual("{0[ ]}".format({' ':3}), '3')

        # attribute and index lookups, including chained ones
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        # large field widths
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # issue 12546: use \x00 as a fill character
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

        # same fill characters with an int argument
        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
        self.assertEqual('{0:<6}'.format(3), '3     ')

        # same fill characters with a float argument
        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

        # same fill characters with a complex argument
        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

        # !r, !s and !a coercions
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')

        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

        # a non-empty format spec reaching object.__format__ is a TypeError
        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))

        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(ValueError, "{0[}".format)
        self.assertRaises(ValueError, "{0[}".format, [])
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
        self.assertRaises(ValueError, "{0..foo}".format, 0)
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
        self.assertRaises(ValueError, "{0!x}".format, 3)
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
        self.assertRaises(ValueError, "{!}".format)
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
        # a very large number used as a field index and as an item index
        big = "23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, ("{" + big + "}").format)
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, "{0:-s}".format, '')
        self.assertRaises(ValueError, format, "", "-")
        self.assertRaises(ValueError, "{0:=s}".format, '')

        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

        # Non-ASCII
        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
                         'ABC\u0410\u0411\u0412')
        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
                         'ABC')
        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
                         '')

        # special characters used as dictionary keys inside [...]
        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
        self.assertRaises(ValueError, "{a{}b}".format, 42)
        self.assertRaises(ValueError, "{a{b}".format, 42)
        self.assertRaises(ValueError, "{[}".format, 42)

        # auto-numbered field nested inside a format spec
        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")

        # Blocking fallback
        # M sets __str__ to None: !r still works, !s and plain {} raise.
        m = M('data')
        self.assertEqual("{!r}".format(m), 'M(data)')
        self.assertRaises(TypeError, "{!s}".format, m)
        self.assertRaises(TypeError, "{}".format, m)
        # N sets __format__ to None: !r and !s work, plain {} raises.
        n = N('data')
        self.assertEqual("{!r}".format(n), 'N(data)')
        self.assertEqual("{!s}".format(n), 'N(data)')
        self.assertRaises(TypeError, "{}".format, n)
1233
1234    def test_format_map(self):
1235        self.assertEqual(''.format_map({}), '')
1236        self.assertEqual('a'.format_map({}), 'a')
1237        self.assertEqual('ab'.format_map({}), 'ab')
1238        self.assertEqual('a{{'.format_map({}), 'a{')
1239        self.assertEqual('a}}'.format_map({}), 'a}')
1240        self.assertEqual('{{b'.format_map({}), '{b')
1241        self.assertEqual('}}b'.format_map({}), '}b')
1242        self.assertEqual('a{{b'.format_map({}), 'a{b')
1243
1244        # using mappings
1245        class Mapping(dict):
1246            def __missing__(self, key):
1247                return key
1248        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1249        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1250
1251        class InternalMapping:
1252            def __init__(self):
1253                self.mapping = {'a': 'hello'}
1254            def __getitem__(self, key):
1255                return self.mapping[key]
1256        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1257
1258
1259        class C:
1260            def __init__(self, x=100):
1261                self._x = x
1262            def __format__(self, spec):
1263                return spec
1264        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1265
1266        # test various errors
1267        self.assertRaises(TypeError, ''.format_map)
1268        self.assertRaises(TypeError, 'a'.format_map)
1269
1270        self.assertRaises(ValueError, '{'.format_map, {})
1271        self.assertRaises(ValueError, '}'.format_map, {})
1272        self.assertRaises(ValueError, 'a{'.format_map, {})
1273        self.assertRaises(ValueError, 'a}'.format_map, {})
1274        self.assertRaises(ValueError, '{a'.format_map, {})
1275        self.assertRaises(ValueError, '}a'.format_map, {})
1276
1277        # issue #12579: can't supply positional params to format_map
1278        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1279        self.assertRaises(ValueError, '{}'.format_map, 'a')
1280        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1281
1282    def test_format_huge_precision(self):
1283        format_string = ".{}f".format(sys.maxsize + 1)
1284        with self.assertRaises(ValueError):
1285            result = format(2.34, format_string)
1286
1287    def test_format_huge_width(self):
1288        format_string = "{}f".format(sys.maxsize + 1)
1289        with self.assertRaises(ValueError):
1290            result = format(2.34, format_string)
1291
1292    def test_format_huge_item_number(self):
1293        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1294        with self.assertRaises(ValueError):
1295            result = format_string.format(2.34)
1296
1297    def test_format_auto_numbering(self):
1298        class C:
1299            def __init__(self, x=100):
1300                self._x = x
1301            def __format__(self, spec):
1302                return spec
1303
1304        self.assertEqual('{}'.format(10), '10')
1305        self.assertEqual('{:5}'.format('s'), 's    ')
1306        self.assertEqual('{!r}'.format('s'), "'s'")
1307        self.assertEqual('{._x}'.format(C(10)), '10')
1308        self.assertEqual('{[1]}'.format([1, 2]), '2')
1309        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1310        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1311
1312        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1313        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1314
1315        # can't mix and match numbering and auto-numbering
1316        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1317        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1318        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1319        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1320
1321        # can mix and match auto-numbering and named
1322        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1323        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1324        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1325        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1326
1327    def test_formatting(self):
1328        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1329        # Testing Unicode formatting strings...
1330        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1331        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1332        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1333        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1334        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1335        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1336        if not sys.platform.startswith('java'):
1337            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1338            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1339            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1340        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1341        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1342
1343        self.assertEqual('%c' % 0x1234, '\u1234')
1344        self.assertEqual('%c' % 0x21483, '\U00021483')
1345        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1346        self.assertEqual('%c' % '\U00021483', '\U00021483')
1347        self.assertRaises(TypeError, "%c".__mod__, "aa")
1348        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1349        self.assertRaises(TypeError, "%i".__mod__, "aa")
1350
1351        # formatting jobs delegated from the string implementation:
1352        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1353        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1354        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1355        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1356        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1357        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1358        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1359        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1360        self.assertEqual('...%s...' % "abc", '...abc...')
1361        self.assertEqual('%*s' % (5,'abc',), '  abc')
1362        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1363        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1364        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1365        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1366        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1367        self.assertEqual('%c' % 'a', 'a')
1368        class Wrapper:
1369            def __str__(self):
1370                return '\u1234'
1371        self.assertEqual('%s' % Wrapper(), '\u1234')
1372
1373        # issue 3382
1374        NAN = float('nan')
1375        INF = float('inf')
1376        self.assertEqual('%f' % NAN, 'nan')
1377        self.assertEqual('%F' % NAN, 'NAN')
1378        self.assertEqual('%f' % INF, 'inf')
1379        self.assertEqual('%F' % INF, 'INF')
1380
1381        # PEP 393
1382        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1383        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1384
1385        #issue 19995
1386        class PseudoInt:
1387            def __init__(self, value):
1388                self.value = int(value)
1389            def __int__(self):
1390                return self.value
1391            def __index__(self):
1392                return self.value
1393        class PseudoFloat:
1394            def __init__(self, value):
1395                self.value = float(value)
1396            def __int__(self):
1397                return int(self.value)
1398        pi = PseudoFloat(3.1415)
1399        letter_m = PseudoInt(109)
1400        self.assertEqual('%x' % 42, '2a')
1401        self.assertEqual('%X' % 15, 'F')
1402        self.assertEqual('%o' % 9, '11')
1403        self.assertEqual('%c' % 109, 'm')
1404        self.assertEqual('%x' % letter_m, '6d')
1405        self.assertEqual('%X' % letter_m, '6D')
1406        self.assertEqual('%o' % letter_m, '155')
1407        self.assertEqual('%c' % letter_m, 'm')
1408        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1409        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1410        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1411        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1412        self.assertRaises(TypeError, operator.mod, '%c', pi),
1413
1414    def test_formatting_with_enum(self):
1415        # issue18780
1416        import enum
1417        class Float(float, enum.Enum):
1418            PI = 3.1415926
1419        class Int(enum.IntEnum):
1420            IDES = 15
1421        class Str(str, enum.Enum):
1422            ABC = 'abc'
1423        # Testing Unicode formatting strings...
1424        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1425                         'Str.ABC, Str.ABC')
1426        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1427                        (Str.ABC, Str.ABC,
1428                         Int.IDES, Int.IDES, Int.IDES,
1429                         Float.PI, Float.PI),
1430                         'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')
1431
1432        # formatting jobs delegated from the string implementation:
1433        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1434                         '...Str.ABC...')
1435        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1436                         '...Int.IDES...')
1437        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1438                         '...15...')
1439        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1440                         '...15...')
1441        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1442                         '...15...')
1443        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1444                         '...3.141593...')
1445
1446    def test_formatting_huge_precision(self):
1447        format_string = "%.{}f".format(sys.maxsize + 1)
1448        with self.assertRaises(ValueError):
1449            result = format_string % 2.34
1450
1451    def test_issue28598_strsubclass_rhs(self):
1452        # A subclass of str with an __rmod__ method should be able to hook
1453        # into the % operator
1454        class SubclassedStr(str):
1455            def __rmod__(self, other):
1456                return 'Success, self.__rmod__({!r}) was called'.format(other)
1457        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1458                         "Success, self.__rmod__('lhs %% %r') was called")
1459
1460    @support.cpython_only
1461    def test_formatting_huge_precision_c_limits(self):
1462        from _testcapi import INT_MAX
1463        format_string = "%.{}f".format(INT_MAX + 1)
1464        with self.assertRaises(ValueError):
1465            result = format_string % 2.34
1466
1467    def test_formatting_huge_width(self):
1468        format_string = "%{}f".format(sys.maxsize + 1)
1469        with self.assertRaises(ValueError):
1470            result = format_string % 2.34
1471
1472    def test_startswith_endswith_errors(self):
1473        for meth in ('foo'.startswith, 'foo'.endswith):
1474            with self.assertRaises(TypeError) as cm:
1475                meth(['f'])
1476            exc = str(cm.exception)
1477            self.assertIn('str', exc)
1478            self.assertIn('tuple', exc)
1479
1480    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1481    def test_format_float(self):
1482        # should not format with a comma, but always with C locale
1483        self.assertEqual('1.0', '%.1f' % 1.0)
1484
1485    def test_constructor(self):
1486        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1487
1488        self.assertEqual(
1489            str('unicode remains unicode'),
1490            'unicode remains unicode'
1491        )
1492
1493        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1494            subclass = StrSubclass(text)
1495            self.assertEqual(str(subclass), text)
1496            self.assertEqual(len(subclass), len(text))
1497            if text == 'ascii':
1498                self.assertEqual(subclass.encode('ascii'), b'ascii')
1499                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1500
1501        self.assertEqual(
1502            str('strings are converted to unicode'),
1503            'strings are converted to unicode'
1504        )
1505
1506        class StringCompat:
1507            def __init__(self, x):
1508                self.x = x
1509            def __str__(self):
1510                return self.x
1511
1512        self.assertEqual(
1513            str(StringCompat('__str__ compatible objects are recognized')),
1514            '__str__ compatible objects are recognized'
1515        )
1516
1517        # unicode(obj) is compatible to str():
1518
1519        o = StringCompat('unicode(obj) is compatible to str()')
1520        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1521        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1522
1523        for obj in (123, 123.45, 123):
1524            self.assertEqual(str(obj), str(str(obj)))
1525
1526        # unicode(obj, encoding, error) tests (this maps to
1527        # PyUnicode_FromEncodedObject() at C level)
1528
1529        if not sys.platform.startswith('java'):
1530            self.assertRaises(
1531                TypeError,
1532                str,
1533                'decoding unicode is not supported',
1534                'utf-8',
1535                'strict'
1536            )
1537
1538        self.assertEqual(
1539            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1540            'strings are decoded to unicode'
1541        )
1542
1543        if not sys.platform.startswith('java'):
1544            self.assertEqual(
1545                str(
1546                    memoryview(b'character buffers are decoded to unicode'),
1547                    'utf-8',
1548                    'strict'
1549                ),
1550                'character buffers are decoded to unicode'
1551            )
1552
1553        self.assertRaises(TypeError, str, 42, 42, 42)
1554
1555    def test_constructor_keyword_args(self):
1556        """Pass various keyword argument combinations to the constructor."""
1557        # The object argument can be passed as a keyword.
1558        self.assertEqual(str(object='foo'), 'foo')
1559        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1560        # The errors argument without encoding triggers "decode" mode.
1561        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1562        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1563
1564    def test_constructor_defaults(self):
1565        """Check the constructor argument defaults."""
1566        # The object argument defaults to '' or b''.
1567        self.assertEqual(str(), '')
1568        self.assertEqual(str(errors='strict'), '')
1569        utf8_cent = '¢'.encode('utf-8')
1570        # The encoding argument defaults to utf-8.
1571        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1572        # The errors argument defaults to strict.
1573        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1574
1575    def test_codecs_utf7(self):
1576        utfTests = [
1577            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1578            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1579            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1580            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1581            ('+', b'+-'),
1582            ('+-', b'+--'),
1583            ('+?', b'+-?'),
1584            (r'\?', b'+AFw?'),
1585            ('+?', b'+-?'),
1586            (r'\\?', b'+AFwAXA?'),
1587            (r'\\\?', b'+AFwAXABc?'),
1588            (r'++--', b'+-+---'),
1589            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1590            ('/', b'/'),
1591        ]
1592
1593        for (x, y) in utfTests:
1594            self.assertEqual(x.encode('utf-7'), y)
1595
1596        # Unpaired surrogates are passed through
1597        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1598        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1599        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1600        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1601        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1602        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1603        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1604        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1605
1606        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1607        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1608
1609        # Issue #2242: crash on some Windows/MSVC versions
1610        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1611
1612        # Direct encoded characters
1613        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1614        # Optional direct characters
1615        set_o = '!"#$%&*;<=>@[]^_`{|}'
1616        for c in set_d:
1617            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1618            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1619        for c in set_o:
1620            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1621
1622    def test_codecs_utf8(self):
1623        self.assertEqual(''.encode('utf-8'), b'')
1624        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1625        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1626        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1627        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1628        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1629        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1630                         b'\xf0\x90\x80\x82'*10)
1631        self.assertEqual(
1632            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1633            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1634            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1635            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1636            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1637            ' Nunstuck git und'.encode('utf-8'),
1638            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1639            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1640            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1641            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1642            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1643            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1644            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1645            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1646            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1647            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1648        )
1649
1650        # UTF-8 specific decoding tests
1651        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1652        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1653        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1654
1655        # Other possible utf-8 test cases:
1656        # * strict decoding testing for all of the
1657        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1658
1659    def test_utf8_decode_valid_sequences(self):
1660        sequences = [
1661            # single byte
1662            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1663            # 2 bytes
1664            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1665            # 3 bytes
1666            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1667            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1668            # 4 bytes
1669            (b'\xF0\x90\x80\x80', '\U00010000'),
1670            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1671        ]
1672        for seq, res in sequences:
1673            self.assertEqual(seq.decode('utf-8'), res)
1674
1675
1676    def test_utf8_decode_invalid_sequences(self):
1677        # continuation bytes in a sequence of 2, 3, or 4 bytes
1678        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1679        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1680        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1681        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1682        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1683        invalid_start_bytes = (
1684            continuation_bytes + invalid_2B_seq_start_bytes +
1685            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1686        )
1687
1688        for byte in invalid_start_bytes:
1689            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1690
1691        for sb in invalid_2B_seq_start_bytes:
1692            for cb in continuation_bytes:
1693                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1694
1695        for sb in invalid_4B_seq_start_bytes:
1696            for cb1 in continuation_bytes[:3]:
1697                for cb3 in continuation_bytes[:3]:
1698                    self.assertRaises(UnicodeDecodeError,
1699                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1700
1701        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1702            self.assertRaises(UnicodeDecodeError,
1703                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1704            self.assertRaises(UnicodeDecodeError,
1705                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1706        # surrogates
1707        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1708            self.assertRaises(UnicodeDecodeError,
1709                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1710            self.assertRaises(UnicodeDecodeError,
1711                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1712        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1713            self.assertRaises(UnicodeDecodeError,
1714                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1715            self.assertRaises(UnicodeDecodeError,
1716                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1717        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1718            self.assertRaises(UnicodeDecodeError,
1719                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1720            self.assertRaises(UnicodeDecodeError,
1721                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1722
1723    def test_issue8271(self):
1724        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1725        # only the start byte and the continuation byte(s) are now considered
1726        # invalid, instead of the number of bytes specified by the start byte.
1727        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1728        # table 3-8, Row 2) for more information about the algorithm used.
1729        FFFD = '\ufffd'
1730        sequences = [
1731            # invalid start bytes
1732            (b'\x80', FFFD), # continuation byte
1733            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1734            (b'\xc0', FFFD),
1735            (b'\xc0\xc0', FFFD*2),
1736            (b'\xc1', FFFD),
1737            (b'\xc1\xc0', FFFD*2),
1738            (b'\xc0\xc1', FFFD*2),
1739            # with start byte of a 2-byte sequence
1740            (b'\xc2', FFFD), # only the start byte
1741            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1742            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1743            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1744            # with start byte of a 3-byte sequence
1745            (b'\xe1', FFFD), # only the start byte
1746            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1747            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1748            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1749            (b'\xe1\x80', FFFD), # only 1 continuation byte
1750            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1751            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1752            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1753            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1754            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1755            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1756            # with start byte of a 4-byte sequence
1757            (b'\xf1', FFFD), # only the start byte
1758            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1759            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1760            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1761            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1762            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1763            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1764            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1765            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1766            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1767            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1768            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1769            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1770            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1771            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1772            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1773            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1774            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1775            # with invalid start byte of a 4-byte sequence (rfc2279)
1776            (b'\xf5', FFFD), # only the start byte
1777            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1778            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1779            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1780            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1781            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1782            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1783            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1784            # with invalid start byte of a 5-byte sequence (rfc2279)
1785            (b'\xf8', FFFD), # only the start byte
1786            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1787            (b'\xf8\x80', FFFD*2), # only one continuation byte
1788            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1789            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1790            # with invalid start byte of a 6-byte sequence (rfc2279)
1791            (b'\xfc', FFFD), # only the start byte
1792            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1793            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1794            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1795            # invalid start byte
1796            (b'\xfe', FFFD),
1797            (b'\xfe\x80\x80', FFFD*3),
1798            # other sequences
1799            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1800            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1801            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1802            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1803             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1804        ]
1805        for n, (seq, res) in enumerate(sequences):
1806            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1807            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1808            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1809            self.assertEqual(seq.decode('utf-8', 'ignore'),
1810                             res.replace('\uFFFD', ''))
1811
1812    def to_bytestring(self, seq):
1813        return bytes(int(c, 16) for c in seq.split())
1814
1815    def assertCorrectUTF8Decoding(self, seq, res, err):
1816        """
1817        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1818        'strict' is used, returns res when 'replace' is used, and that doesn't
1819        return anything when 'ignore' is used.
1820        """
1821        with self.assertRaises(UnicodeDecodeError) as cm:
1822            seq.decode('utf-8')
1823        exc = cm.exception
1824
1825        self.assertIn(err, str(exc))
1826        self.assertEqual(seq.decode('utf-8', 'replace'), res)
1827        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1828                         'aaaa' + res + 'bbbb')
1829        res = res.replace('\ufffd', '')
1830        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1831        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1832                          'aaaa' + res + 'bbbb')
1833
1834    def test_invalid_start_byte(self):
1835        """
1836        Test that an 'invalid start byte' error is raised when the first byte
1837        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1838        4-bytes sequence. The invalid start byte is replaced with a single
1839        U+FFFD when errors='replace'.
1840        E.g. <80> is a continuation byte and can appear only after a start byte.
1841        """
1842        FFFD = '\ufffd'
1843        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1844            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1845                                           'invalid start byte')
1846
1847    def test_unexpected_end_of_data(self):
1848        """
1849        Test that an 'unexpected end of data' error is raised when the string
1850        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1851        enough continuation bytes.  The incomplete sequence is replaced with a
1852        single U+FFFD when errors='replace'.
1853        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1854        sequence, but it's followed by only 2 valid continuation bytes and the
1855        last continuation bytes is missing.
1856        Note: the continuation bytes must be all valid, if one of them is
1857        invalid another error will be raised.
1858        """
1859        sequences = [
1860            'C2', 'DF',
1861            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1862            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1863            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1864            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1865            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1866            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1867        ]
1868        FFFD = '\ufffd'
1869        for seq in sequences:
1870            self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd',
1871                                           'unexpected end of data')
1872
1873    def test_invalid_cb_for_2bytes_seq(self):
1874        """
1875        Test that an 'invalid continuation byte' error is raised when the
1876        continuation byte of a 2-bytes sequence is invalid.  The start byte
1877        is replaced by a single U+FFFD and the second byte is handled
1878        separately when errors='replace'.
1879        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1880        sequence, but 41 is not a valid continuation byte because it's the
1881        ASCII letter 'A'.
1882        """
1883        FFFD = '\ufffd'
1884        FFFDx2 = FFFD * 2
1885        sequences = [
1886            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1887            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1888            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1889            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1890        ]
1891        for seq, res in sequences:
1892            self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1893                                           'invalid continuation byte')
1894
1895    def test_invalid_cb_for_3bytes_seq(self):
1896        """
1897        Test that an 'invalid continuation byte' error is raised when the
1898        continuation byte(s) of a 3-bytes sequence are invalid.  When
1899        errors='replace', if the first continuation byte is valid, the first
1900        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1901        third byte is handled separately, otherwise only the start byte is
1902        replaced with a U+FFFD and the other continuation bytes are handled
1903        separately.
1904        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1905        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1906        because it's the ASCII letter 'A'.
1907        Note: when the start byte is E0 or ED, the valid ranges for the first
1908        continuation byte are limited to A0..BF and 80..9F respectively.
1909        Python 2 used to consider all the bytes in range 80..BF valid when the
1910        start byte was ED.  This is fixed in Python 3.
1911        """
1912        FFFD = '\ufffd'
1913        FFFDx2 = FFFD * 2
1914        sequences = [
1915            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1916            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1917            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1918            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1919            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1920            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1921            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1922            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1923            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1924            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1925            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1926            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1927            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1928            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1929            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1930            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1931            ('ED 7F', FFFD+'\x7f'),
1932            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1933            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1934            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1935            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1936            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1937            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1938            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1939            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1940            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1941            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1942            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1943            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1944            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1945            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1946            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1947            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1948        ]
1949        for seq, res in sequences:
1950            self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1951                                           'invalid continuation byte')
1952
1953    def test_invalid_cb_for_4bytes_seq(self):
1954        """
1955        Test that an 'invalid continuation byte' error is raised when the
1956        continuation byte(s) of a 4-bytes sequence are invalid.  When
1957        errors='replace',the start byte and all the following valid
1958        continuation bytes are replaced with a single U+FFFD, and all the bytes
1959        starting from the first invalid continuation bytes (included) are
1960        handled separately.
1961        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1962        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1963        because it's the ASCII letter 'A'.
1964        Note: when the start byte is E0 or ED, the valid ranges for the first
1965        continuation byte are limited to A0..BF and 80..9F respectively.
1966        However, when the start byte is ED, Python 2 considers all the bytes
1967        in range 80..BF valid.  This is fixed in Python 3.
1968        """
1969        FFFD = '\ufffd'
1970        FFFDx2 = FFFD * 2
1971        sequences = [
1972            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1973            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1974            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1975            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1976            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1977            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1978            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1979            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1980            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1981            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1982            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1983            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1984            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1985            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1986            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1987            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1988            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1989            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1990            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1991            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
1992            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
1993            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
1994            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
1995            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
1996            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
1997            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
1998            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
1999            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2000            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2001            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2002            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2003            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2004            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2005            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2006            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2007            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2008            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2009            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2010            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2011            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2012            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2013            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2014            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2015            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2016            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2017            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2018            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2019            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2020            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2021            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2022            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2023            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2024            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2025            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2026            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2027        ]
2028        for seq, res in sequences:
2029            self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
2030                                           'invalid continuation byte')
2031
2032    def test_codecs_idna(self):
2033        # Test whether trailing dot is preserved
2034        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2035
2036    def test_codecs_errors(self):
2037        # Error handling (encoding)
2038        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2039        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2040        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2041        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2042        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2043                         'Andr\202 x'.encode('ascii', errors='replace'))
2044        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2045                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2046
2047        # Error handling (decoding)
2048        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2049        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2050        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2051        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2052        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2053
2054        # Error handling (unknown character names)
2055        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2056
2057        # Error handling (truncated escape sequence)
2058        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2059
2060        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2061        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2062        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2063        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2064
2065        # Error handling (wrong arguments)
2066        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2067
2068        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
2069        self.assertRaises(UnicodeError, float, "\ud800")
2070        self.assertRaises(UnicodeError, float, "\udf00")
2071        self.assertRaises(UnicodeError, complex, "\ud800")
2072        self.assertRaises(UnicodeError, complex, "\udf00")
2073
2074    def test_codecs(self):
2075        # Encoding
2076        self.assertEqual('hello'.encode('ascii'), b'hello')
2077        self.assertEqual('hello'.encode('utf-7'), b'hello')
2078        self.assertEqual('hello'.encode('utf-8'), b'hello')
2079        self.assertEqual('hello'.encode('utf-8'), b'hello')
2080        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2081        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2082        self.assertEqual('hello'.encode('latin-1'), b'hello')
2083
2084        # Default encoding is utf-8
2085        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2086
2087        # Roundtrip safety for BMP (just the first 1024 chars)
2088        for c in range(1024):
2089            u = chr(c)
2090            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2091                             'utf-16-be', 'raw_unicode_escape',
2092                             'unicode_escape', 'unicode_internal'):
2093                with warnings.catch_warnings():
2094                    # unicode-internal has been deprecated
2095                    warnings.simplefilter("ignore", DeprecationWarning)
2096
2097                    self.assertEqual(str(u.encode(encoding),encoding), u)
2098
2099        # Roundtrip safety for BMP (just the first 256 chars)
2100        for c in range(256):
2101            u = chr(c)
2102            for encoding in ('latin-1',):
2103                self.assertEqual(str(u.encode(encoding),encoding), u)
2104
2105        # Roundtrip safety for BMP (just the first 128 chars)
2106        for c in range(128):
2107            u = chr(c)
2108            for encoding in ('ascii',):
2109                self.assertEqual(str(u.encode(encoding),encoding), u)
2110
2111        # Roundtrip safety for non-BMP (just a few chars)
2112        with warnings.catch_warnings():
2113            # unicode-internal has been deprecated
2114            warnings.simplefilter("ignore", DeprecationWarning)
2115
2116            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2117            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2118                             'raw_unicode_escape',
2119                             'unicode_escape', 'unicode_internal'):
2120                self.assertEqual(str(u.encode(encoding),encoding), u)
2121
2122        # UTF-8 must be roundtrip safe for all code points
2123        # (except surrogates, which are forbidden).
2124        u = ''.join(map(chr, list(range(0, 0xd800)) +
2125                             list(range(0xe000, 0x110000))))
2126        for encoding in ('utf-8',):
2127            self.assertEqual(str(u.encode(encoding),encoding), u)
2128
2129    def test_codecs_charmap(self):
2130        # 0-127
2131        s = bytes(range(128))
2132        for encoding in (
2133            'cp037', 'cp1026', 'cp273',
2134            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2135            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2136            'cp863', 'cp865', 'cp866', 'cp1125',
2137            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2138            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2139            'iso8859_7', 'iso8859_9',
2140            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2141            'mac_cyrillic', 'mac_latin2',
2142
2143            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2144            'cp1256', 'cp1257', 'cp1258',
2145            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2146
2147            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2148            'cp1006', 'iso8859_8',
2149
2150            ### These have undefined mappings:
2151            #'cp424',
2152
2153            ### These fail the round-trip:
2154            #'cp875'
2155
2156            ):
2157            self.assertEqual(str(s, encoding).encode(encoding), s)
2158
2159        # 128-255
2160        s = bytes(range(128, 256))
2161        for encoding in (
2162            'cp037', 'cp1026', 'cp273',
2163            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2164            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2165            'cp863', 'cp865', 'cp866', 'cp1125',
2166            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2167            'iso8859_2', 'iso8859_4', 'iso8859_5',
2168            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2169            'mac_cyrillic', 'mac_latin2',
2170
2171            ### These have undefined mappings:
2172            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2173            #'cp1256', 'cp1257', 'cp1258',
2174            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2175            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2176            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2177
2178            ### These fail the round-trip:
2179            #'cp1006', 'cp875', 'iso8859_8',
2180
2181            ):
2182            self.assertEqual(str(s, encoding).encode(encoding), s)
2183
2184    def test_concatenation(self):
2185        self.assertEqual(("abc" "def"), "abcdef")
2186        self.assertEqual(("abc" "def"), "abcdef")
2187        self.assertEqual(("abc" "def"), "abcdef")
2188        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2189        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2190
2191    def test_printing(self):
2192        class BitBucket:
2193            def write(self, text):
2194                pass
2195
2196        out = BitBucket()
2197        print('abc', file=out)
2198        print('abc', 'def', file=out)
2199        print('abc', 'def', file=out)
2200        print('abc', 'def', file=out)
2201        print('abc\n', file=out)
2202        print('abc\n', end=' ', file=out)
2203        print('abc\n', end=' ', file=out)
2204        print('def\n', file=out)
2205        print('def\n', file=out)
2206
2207    def test_ucs4(self):
2208        x = '\U00100000'
2209        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2210        self.assertEqual(x, y)
2211
2212        y = br'\U00100000'
2213        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2214        self.assertEqual(x, y)
2215        y = br'\U00010000'
2216        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2217        self.assertEqual(x, y)
2218
2219        try:
2220            br'\U11111111'.decode("raw-unicode-escape")
2221        except UnicodeDecodeError as e:
2222            self.assertEqual(e.start, 0)
2223            self.assertEqual(e.end, 10)
2224        else:
2225            self.fail("Should have raised UnicodeDecodeError")
2226
2227    def test_conversion(self):
2228        # Make sure __str__() works properly
2229        class ObjectToStr:
2230            def __str__(self):
2231                return "foo"
2232
2233        class StrSubclassToStr(str):
2234            def __str__(self):
2235                return "foo"
2236
2237        class StrSubclassToStrSubclass(str):
2238            def __new__(cls, content=""):
2239                return str.__new__(cls, 2*content)
2240            def __str__(self):
2241                return self
2242
2243        self.assertEqual(str(ObjectToStr()), "foo")
2244        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2245        s = str(StrSubclassToStrSubclass("foo"))
2246        self.assertEqual(s, "foofoo")
2247        self.assertIs(type(s), StrSubclassToStrSubclass)
2248        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2249        self.assertEqual(s, "foofoo")
2250        self.assertIs(type(s), StrSubclass)
2251
2252    def test_unicode_repr(self):
2253        class s1:
2254            def __repr__(self):
2255                return '\\n'
2256
2257        class s2:
2258            def __repr__(self):
2259                return '\\n'
2260
2261        self.assertEqual(repr(s1()), '\\n')
2262        self.assertEqual(repr(s2()), '\\n')
2263
2264    def test_printable_repr(self):
2265        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2266        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2267
2268    # This test only affects 32-bit platforms because expandtabs can only take
2269    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2270    # to take a 64-bit long, this test should apply to all platforms.
2271    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2272                     'only applies to 32-bit platforms')
2273    def test_expandtabs_overflows_gracefully(self):
2274        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2275
2276    @support.cpython_only
2277    def test_expandtabs_optimization(self):
2278        s = 'abc'
2279        self.assertIs(s.expandtabs(), s)
2280
2281    def test_raiseMemError(self):
2282        if struct.calcsize('P') == 8:
2283            # 64 bits pointers
2284            ascii_struct_size = 48
2285            compact_struct_size = 72
2286        else:
2287            # 32 bits pointers
2288            ascii_struct_size = 24
2289            compact_struct_size = 36
2290
2291        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2292            code = ord(char)
2293            if code < 0x100:
2294                char_size = 1  # sizeof(Py_UCS1)
2295                struct_size = ascii_struct_size
2296            elif code < 0x10000:
2297                char_size = 2  # sizeof(Py_UCS2)
2298                struct_size = compact_struct_size
2299            else:
2300                char_size = 4  # sizeof(Py_UCS4)
2301                struct_size = compact_struct_size
2302            # Note: sys.maxsize is half of the actual max allocation because of
2303            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2304            # be allocatable, given enough memory.
2305            maxlen = ((sys.maxsize - struct_size) // char_size)
2306            alloc = lambda: char * maxlen
2307            self.assertRaises(MemoryError, alloc)
2308            self.assertRaises(MemoryError, alloc)
2309
2310    def test_format_subclass(self):
2311        class S(str):
2312            def __str__(self):
2313                return '__str__ overridden'
2314        s = S('xxx')
2315        self.assertEqual("%s" % s, '__str__ overridden')
2316        self.assertEqual("{}".format(s), '__str__ overridden')
2317
2318    def test_subclass_add(self):
2319        class S(str):
2320            def __add__(self, o):
2321                return "3"
2322        self.assertEqual(S("4") + S("5"), "3")
2323        class S(str):
2324            def __iadd__(self, o):
2325                return "3"
2326        s = S("1")
2327        s += "4"
2328        self.assertEqual(s, "3")
2329
2330    def test_getnewargs(self):
2331        text = 'abc'
2332        args = text.__getnewargs__()
2333        self.assertIsNot(args[0], text)
2334        self.assertEqual(args[0], text)
2335        self.assertEqual(len(args), 1)
2336
2337    def test_resize(self):
2338        for length in range(1, 100, 7):
2339            # generate a fresh string (refcount=1)
2340            text = 'a' * length + 'b'
2341
2342            with support.check_warnings(('unicode_internal codec has been '
2343                                         'deprecated', DeprecationWarning)):
2344                # fill wstr internal field
2345                abc = text.encode('unicode_internal')
2346                self.assertEqual(abc.decode('unicode_internal'), text)
2347
2348                # resize text: wstr field must be cleared and then recomputed
2349                text += 'c'
2350                abcdef = text.encode('unicode_internal')
2351                self.assertNotEqual(abc, abcdef)
2352                self.assertEqual(abcdef.decode('unicode_internal'), text)
2353
2354    def test_compare(self):
2355        # Issue #17615
2356        N = 10
2357        ascii = 'a' * N
2358        ascii2 = 'z' * N
2359        latin = '\x80' * N
2360        latin2 = '\xff' * N
2361        bmp = '\u0100' * N
2362        bmp2 = '\uffff' * N
2363        astral = '\U00100000' * N
2364        astral2 = '\U0010ffff' * N
2365        strings = (
2366            ascii, ascii2,
2367            latin, latin2,
2368            bmp, bmp2,
2369            astral, astral2)
2370        for text1, text2 in itertools.combinations(strings, 2):
2371            equal = (text1 is text2)
2372            self.assertEqual(text1 == text2, equal)
2373            self.assertEqual(text1 != text2, not equal)
2374
2375            if equal:
2376                self.assertTrue(text1 <= text2)
2377                self.assertTrue(text1 >= text2)
2378
2379                # text1 is text2: duplicate strings to skip the "str1 == str2"
2380                # optimization in unicode_compare_eq() and really compare
2381                # character per character
2382                copy1 = duplicate_string(text1)
2383                copy2 = duplicate_string(text2)
2384                self.assertIsNot(copy1, copy2)
2385
2386                self.assertTrue(copy1 == copy2)
2387                self.assertFalse(copy1 != copy2)
2388
2389                self.assertTrue(copy1 <= copy2)
2390                self.assertTrue(copy2 >= copy2)
2391
2392        self.assertTrue(ascii < ascii2)
2393        self.assertTrue(ascii < latin)
2394        self.assertTrue(ascii < bmp)
2395        self.assertTrue(ascii < astral)
2396        self.assertFalse(ascii >= ascii2)
2397        self.assertFalse(ascii >= latin)
2398        self.assertFalse(ascii >= bmp)
2399        self.assertFalse(ascii >= astral)
2400
2401        self.assertFalse(latin < ascii)
2402        self.assertTrue(latin < latin2)
2403        self.assertTrue(latin < bmp)
2404        self.assertTrue(latin < astral)
2405        self.assertTrue(latin >= ascii)
2406        self.assertFalse(latin >= latin2)
2407        self.assertFalse(latin >= bmp)
2408        self.assertFalse(latin >= astral)
2409
2410        self.assertFalse(bmp < ascii)
2411        self.assertFalse(bmp < latin)
2412        self.assertTrue(bmp < bmp2)
2413        self.assertTrue(bmp < astral)
2414        self.assertTrue(bmp >= ascii)
2415        self.assertTrue(bmp >= latin)
2416        self.assertFalse(bmp >= bmp2)
2417        self.assertFalse(bmp >= astral)
2418
2419        self.assertFalse(astral < ascii)
2420        self.assertFalse(astral < latin)
2421        self.assertFalse(astral < bmp2)
2422        self.assertTrue(astral < astral2)
2423        self.assertTrue(astral >= ascii)
2424        self.assertTrue(astral >= latin)
2425        self.assertTrue(astral >= bmp2)
2426        self.assertFalse(astral >= astral2)
2427
2428    def test_free_after_iterating(self):
2429        support.check_free_after_iterating(self, iter, str)
2430        support.check_free_after_iterating(self, reversed, str)
2431
2432
2433class CAPITest(unittest.TestCase):
2434
2435    # Test PyUnicode_FromFormat()
2436    def test_from_format(self):
2437        support.import_module('ctypes')
2438        from ctypes import (
2439            pythonapi, py_object, sizeof,
2440            c_int, c_long, c_longlong, c_ssize_t,
2441            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2442        name = "PyUnicode_FromFormat"
2443        _PyUnicode_FromFormat = getattr(pythonapi, name)
2444        _PyUnicode_FromFormat.restype = py_object
2445
2446        def PyUnicode_FromFormat(format, *args):
2447            cargs = tuple(
2448                py_object(arg) if isinstance(arg, str) else arg
2449                for arg in args)
2450            return _PyUnicode_FromFormat(format, *cargs)
2451
2452        def check_format(expected, format, *args):
2453            text = PyUnicode_FromFormat(format, *args)
2454            self.assertEqual(expected, text)
2455
2456        # ascii format, non-ascii argument
2457        check_format('ascii\x7f=unicode\xe9',
2458                     b'ascii\x7f=%U', 'unicode\xe9')
2459
2460        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2461        # raises an error
2462        self.assertRaisesRegex(ValueError,
2463            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2464            'string, got a non-ASCII byte: 0xe9$',
2465            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2466
2467        # test "%c"
2468        check_format('\uabcd',
2469                     b'%c', c_int(0xabcd))
2470        check_format('\U0010ffff',
2471                     b'%c', c_int(0x10ffff))
2472        with self.assertRaises(OverflowError):
2473            PyUnicode_FromFormat(b'%c', c_int(0x110000))
2474        # Issue #18183
2475        check_format('\U00010000\U00100000',
2476                     b'%c%c', c_int(0x10000), c_int(0x100000))
2477
2478        # test "%"
2479        check_format('%',
2480                     b'%')
2481        check_format('%',
2482                     b'%%')
2483        check_format('%s',
2484                     b'%%s')
2485        check_format('[%]',
2486                     b'[%%]')
2487        check_format('%abc',
2488                     b'%%%s', b'abc')
2489
2490        # truncated string
2491        check_format('abc',
2492                     b'%.3s', b'abcdef')
2493        check_format('abc[\ufffd',
2494                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2495        check_format("'\\u20acABC'",
2496                     b'%A', '\u20acABC')
2497        check_format("'\\u20",
2498                     b'%.5A', '\u20acABCDEF')
2499        check_format("'\u20acABC'",
2500                     b'%R', '\u20acABC')
2501        check_format("'\u20acA",
2502                     b'%.3R', '\u20acABCDEF')
2503        check_format('\u20acAB',
2504                     b'%.3S', '\u20acABCDEF')
2505        check_format('\u20acAB',
2506                     b'%.3U', '\u20acABCDEF')
2507        check_format('\u20acAB',
2508                     b'%.3V', '\u20acABCDEF', None)
2509        check_format('abc[\ufffd',
2510                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2511
2512        # following tests comes from #7330
2513        # test width modifier and precision modifier with %S
2514        check_format("repr=  abc",
2515                     b'repr=%5S', 'abc')
2516        check_format("repr=ab",
2517                     b'repr=%.2S', 'abc')
2518        check_format("repr=   ab",
2519                     b'repr=%5.2S', 'abc')
2520
2521        # test width modifier and precision modifier with %R
2522        check_format("repr=   'abc'",
2523                     b'repr=%8R', 'abc')
2524        check_format("repr='ab",
2525                     b'repr=%.3R', 'abc')
2526        check_format("repr=  'ab",
2527                     b'repr=%5.3R', 'abc')
2528
2529        # test width modifier and precision modifier with %A
2530        check_format("repr=   'abc'",
2531                     b'repr=%8A', 'abc')
2532        check_format("repr='ab",
2533                     b'repr=%.3A', 'abc')
2534        check_format("repr=  'ab",
2535                     b'repr=%5.3A', 'abc')
2536
2537        # test width modifier and precision modifier with %s
2538        check_format("repr=  abc",
2539                     b'repr=%5s', b'abc')
2540        check_format("repr=ab",
2541                     b'repr=%.2s', b'abc')
2542        check_format("repr=   ab",
2543                     b'repr=%5.2s', b'abc')
2544
2545        # test width modifier and precision modifier with %U
2546        check_format("repr=  abc",
2547                     b'repr=%5U', 'abc')
2548        check_format("repr=ab",
2549                     b'repr=%.2U', 'abc')
2550        check_format("repr=   ab",
2551                     b'repr=%5.2U', 'abc')
2552
2553        # test width modifier and precision modifier with %V
2554        check_format("repr=  abc",
2555                     b'repr=%5V', 'abc', b'123')
2556        check_format("repr=ab",
2557                     b'repr=%.2V', 'abc', b'123')
2558        check_format("repr=   ab",
2559                     b'repr=%5.2V', 'abc', b'123')
2560        check_format("repr=  123",
2561                     b'repr=%5V', None, b'123')
2562        check_format("repr=12",
2563                     b'repr=%.2V', None, b'123')
2564        check_format("repr=   12",
2565                     b'repr=%5.2V', None, b'123')
2566
2567        # test integer formats (%i, %d, %u)
2568        check_format('010',
2569                     b'%03i', c_int(10))
2570        check_format('0010',
2571                     b'%0.4i', c_int(10))
2572        check_format('-123',
2573                     b'%i', c_int(-123))
2574        check_format('-123',
2575                     b'%li', c_long(-123))
2576        check_format('-123',
2577                     b'%lli', c_longlong(-123))
2578        check_format('-123',
2579                     b'%zi', c_ssize_t(-123))
2580
2581        check_format('-123',
2582                     b'%d', c_int(-123))
2583        check_format('-123',
2584                     b'%ld', c_long(-123))
2585        check_format('-123',
2586                     b'%lld', c_longlong(-123))
2587        check_format('-123',
2588                     b'%zd', c_ssize_t(-123))
2589
2590        check_format('123',
2591                     b'%u', c_uint(123))
2592        check_format('123',
2593                     b'%lu', c_ulong(123))
2594        check_format('123',
2595                     b'%llu', c_ulonglong(123))
2596        check_format('123',
2597                     b'%zu', c_size_t(123))
2598
2599        # test long output
2600        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2601        max_longlong = -min_longlong - 1
2602        check_format(str(min_longlong),
2603                     b'%lld', c_longlong(min_longlong))
2604        check_format(str(max_longlong),
2605                     b'%lld', c_longlong(max_longlong))
2606        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2607        check_format(str(max_ulonglong),
2608                     b'%llu', c_ulonglong(max_ulonglong))
2609        PyUnicode_FromFormat(b'%p', c_void_p(-1))
2610
2611        # test padding (width and/or precision)
2612        check_format('123'.rjust(10, '0'),
2613                     b'%010i', c_int(123))
2614        check_format('123'.rjust(100),
2615                     b'%100i', c_int(123))
2616        check_format('123'.rjust(100, '0'),
2617                     b'%.100i', c_int(123))
2618        check_format('123'.rjust(80, '0').rjust(100),
2619                     b'%100.80i', c_int(123))
2620
2621        check_format('123'.rjust(10, '0'),
2622                     b'%010u', c_uint(123))
2623        check_format('123'.rjust(100),
2624                     b'%100u', c_uint(123))
2625        check_format('123'.rjust(100, '0'),
2626                     b'%.100u', c_uint(123))
2627        check_format('123'.rjust(80, '0').rjust(100),
2628                     b'%100.80u', c_uint(123))
2629
2630        check_format('123'.rjust(10, '0'),
2631                     b'%010x', c_int(0x123))
2632        check_format('123'.rjust(100),
2633                     b'%100x', c_int(0x123))
2634        check_format('123'.rjust(100, '0'),
2635                     b'%.100x', c_int(0x123))
2636        check_format('123'.rjust(80, '0').rjust(100),
2637                     b'%100.80x', c_int(0x123))
2638
2639        # test %A
2640        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2641                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2642
2643        # test %V
2644        check_format('repr=abc',
2645                     b'repr=%V', 'abc', b'xyz')
2646
2647        # Test string decode from parameter of %s using utf-8.
2648        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2649        # '\u4eba\u6c11'
2650        check_format('repr=\u4eba\u6c11',
2651                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2652
2653        #Test replace error handler.
2654        check_format('repr=abc\ufffd',
2655                     b'repr=%V', None, b'abc\xff')
2656
2657        # not supported: copy the raw format string. these tests are just here
2658        # to check for crashes and should not be considered as specifications
2659        check_format('%s',
2660                     b'%1%s', b'abc')
2661        check_format('%1abc',
2662                     b'%1abc')
2663        check_format('%+i',
2664                     b'%+i', c_int(10))
2665        check_format('%.%s',
2666                     b'%.%s', b'abc')
2667
2668    # Test PyUnicode_AsWideChar()
2669    @support.cpython_only
2670    def test_aswidechar(self):
2671        from _testcapi import unicode_aswidechar
2672        support.import_module('ctypes')
2673        from ctypes import c_wchar, sizeof
2674
2675        wchar, size = unicode_aswidechar('abcdef', 2)
2676        self.assertEqual(size, 2)
2677        self.assertEqual(wchar, 'ab')
2678
2679        wchar, size = unicode_aswidechar('abc', 3)
2680        self.assertEqual(size, 3)
2681        self.assertEqual(wchar, 'abc')
2682
2683        wchar, size = unicode_aswidechar('abc', 4)
2684        self.assertEqual(size, 3)
2685        self.assertEqual(wchar, 'abc\0')
2686
2687        wchar, size = unicode_aswidechar('abc', 10)
2688        self.assertEqual(size, 3)
2689        self.assertEqual(wchar, 'abc\0')
2690
2691        wchar, size = unicode_aswidechar('abc\0def', 20)
2692        self.assertEqual(size, 7)
2693        self.assertEqual(wchar, 'abc\0def\0')
2694
2695        nonbmp = chr(0x10ffff)
2696        if sizeof(c_wchar) == 2:
2697            buflen = 3
2698            nchar = 2
2699        else: # sizeof(c_wchar) == 4
2700            buflen = 2
2701            nchar = 1
2702        wchar, size = unicode_aswidechar(nonbmp, buflen)
2703        self.assertEqual(size, nchar)
2704        self.assertEqual(wchar, nonbmp + '\0')
2705
2706    # Test PyUnicode_AsWideCharString()
2707    @support.cpython_only
2708    def test_aswidecharstring(self):
2709        from _testcapi import unicode_aswidecharstring
2710        support.import_module('ctypes')
2711        from ctypes import c_wchar, sizeof
2712
2713        wchar, size = unicode_aswidecharstring('abc')
2714        self.assertEqual(size, 3)
2715        self.assertEqual(wchar, 'abc\0')
2716
2717        wchar, size = unicode_aswidecharstring('abc\0def')
2718        self.assertEqual(size, 7)
2719        self.assertEqual(wchar, 'abc\0def\0')
2720
2721        nonbmp = chr(0x10ffff)
2722        if sizeof(c_wchar) == 2:
2723            nchar = 2
2724        else: # sizeof(c_wchar) == 4
2725            nchar = 1
2726        wchar, size = unicode_aswidecharstring(nonbmp)
2727        self.assertEqual(size, nchar)
2728        self.assertEqual(wchar, nonbmp + '\0')
2729
2730    # Test PyUnicode_AsUCS4()
2731    @support.cpython_only
2732    def test_asucs4(self):
2733        from _testcapi import unicode_asucs4
2734        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2735                  'a\ud800b\udfffc', '\ud834\udd1e']:
2736            l = len(s)
2737            self.assertEqual(unicode_asucs4(s, l, 1), s+'\0')
2738            self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff')
2739            self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff')
2740            self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff')
2741            self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1)
2742            self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0)
2743            s = '\0'.join([s, s])
2744            self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
2745            self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')
2746
2747    # Test PyUnicode_CopyCharacters()
2748    @support.cpython_only
2749    def test_copycharacters(self):
2750        from _testcapi import unicode_copycharacters
2751
2752        strings = [
2753            'abcde', '\xa1\xa2\xa3\xa4\xa5',
2754            '\u4f60\u597d\u4e16\u754c\uff01',
2755            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2756        ]
2757
2758        for idx, from_ in enumerate(strings):
2759            # wide -> narrow: exceed maxchar limitation
2760            for to in strings[:idx]:
2761                self.assertRaises(
2762                    SystemError,
2763                    unicode_copycharacters, to, 0, from_, 0, 5
2764                )
2765            # same kind
2766            for from_start in range(5):
2767                self.assertEqual(
2768                    unicode_copycharacters(from_, 0, from_, from_start, 5),
2769                    (from_[from_start:from_start+5].ljust(5, '\0'),
2770                     5-from_start)
2771                )
2772            for to_start in range(5):
2773                self.assertEqual(
2774                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
2775                    (from_[to_start:to_start+5].rjust(5, '\0'),
2776                     5-to_start)
2777                )
2778            # narrow -> wide
2779            # Tests omitted since this creates invalid strings.
2780
2781        s = strings[0]
2782        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2783        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2784        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2785        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2786        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2787        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2788        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2789
2790    @support.cpython_only
2791    def test_encode_decimal(self):
2792        from _testcapi import unicode_encodedecimal
2793        self.assertEqual(unicode_encodedecimal('123'),
2794                         b'123')
2795        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2796                         b'3.14')
2797        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2798                         b' 3.14 ')
2799        self.assertRaises(UnicodeEncodeError,
2800                          unicode_encodedecimal, "123\u20ac", "strict")
2801        self.assertRaisesRegex(
2802            ValueError,
2803            "^'decimal' codec can't encode character",
2804            unicode_encodedecimal, "123\u20ac", "replace")
2805
2806    @support.cpython_only
2807    def test_transform_decimal(self):
2808        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2809        self.assertEqual(transform_decimal('123'),
2810                         '123')
2811        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2812                         '3.14')
2813        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2814                         "\N{EM SPACE}3.14\N{EN SPACE}")
2815        self.assertEqual(transform_decimal('123\u20ac'),
2816                         '123\u20ac')
2817
2818    @support.cpython_only
    def test_pep393_utf8_caching_bug(self):
        # Issue #25709: Problem with string concatenation and utf-8 cache
        #
        # Regression test: grow a string one character at a time and check
        # after every step that getargs_s_hash() returns the UTF-8 encoding
        # of the *current* string rather than of an earlier, shorter one.
        from _testcapi import getargs_s_hash
        # The code points encode to 1, 2, 3 and 4 UTF-8 bytes respectively
        # (ASCII, Latin-1, BMP, non-BMP).
        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
            s = ''
            for i in range(5):
                # Due to CPython specific optimization the 's' string can be
                # resized in-place.
                # NOTE(review): keep this as '+=' with a fresh chr(k); the
                # optimization presumably depends on 's' being the only
                # reference to the string -- confirm before refactoring.
                s += chr(k)
                # Parsing with the "s#" format code calls indirectly
                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
                # encoded string cached in the Unicode object.
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
                # Check that the second call returns the same result
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2834
class StringModuleTest(unittest.TestCase):
    """Tests for the format-string helpers exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        # _string.formatter_parser() yields one 4-tuple per chunk of the
        # template; the expectations below pair each template with the
        # complete list of tuples it must produce.
        expectations = [
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        ]
        for template, expected in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        # _string.formatter_field_name_split() returns the leading name and
        # an iterable of (is_attribute, key) pairs; the iterable is turned
        # into a list so it can be compared directly.
        def split(name):
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        expectations = [
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        ]
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2884
2885
# Run the test cases defined in this module when the file is executed
# directly as a script; nothing runs when the module is merely imported.
if __name__ == "__main__":
    unittest.main()
2888