1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""#"
8import sys
9import struct
10import codecs
11import unittest
12from test import test_support, string_tests
13
# Skip decorator for tests that cannot run on a narrow build, i.e. when
# sys.maxunicode is 0xFFFF.
requires_wide_build = unittest.skipIf(
    sys.maxunicode == 0xFFFF, 'requires wide build')
17
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function that serves two deliberately broken codecs.

    "test.unicode1" maps to an encoder/decoder pair returning a bare int
    instead of a tuple; "test.unicode2" maps to a pair returning a tuple
    that holds no unicode object.  Any other name yields None.
    """
    def int_encode(input, errors="strict"):
        return 42  # not a tuple

    def int_decode(input, errors="strict"):
        return 42  # not a tuple

    def pair_encode(input, errors="strict"):
        return (42, 42)  # no unicode

    def pair_decode(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (int_encode, int_decode, None, None)
    if encoding == "test.unicode2":
        return (pair_encode, pair_decode, None, None)
    return None

# Install the hook so the broken codecs can be looked up by name.
codecs.register(search_function)
35
36class UnicodeTest(
37    string_tests.CommonTest,
38    string_tests.MixinStrUnicodeUserStringTest,
39    string_tests.MixinStrUnicodeTest,
40    ):
41    type2test = unicode
42
43    def assertEqual(self, first, second, msg=None):
44        # strict assertEqual method: reject implicit bytes/unicode equality
45        super(UnicodeTest, self).assertEqual(first, second, msg)
46        if isinstance(first, unicode) or isinstance(second, unicode):
47            self.assertIsInstance(first, unicode)
48            self.assertIsInstance(second, unicode)
49        elif isinstance(first, str) or isinstance(second, str):
50            self.assertIsInstance(first, str)
51            self.assertIsInstance(second, str)
52
53    def checkequalnofix(self, result, object, methodname, *args):
54        method = getattr(object, methodname)
55        realresult = method(*args)
56        self.assertEqual(realresult, result)
57        self.assertTrue(type(realresult) is type(result))
58
59        # if the original is returned make sure that
60        # this doesn't happen with subclasses
61        if realresult is object:
62            class usub(unicode):
63                def __repr__(self):
64                    return 'usub(%r)' % unicode.__repr__(self)
65            object = usub(object)
66            method = getattr(object, methodname)
67            realresult = method(*args)
68            self.assertEqual(realresult, result)
69            self.assertTrue(object is not realresult)
70
71    def test_literals(self):
72        self.assertEqual(u'\xff', u'\u00ff')
73        self.assertEqual(u'\uffff', u'\U0000ffff')
74        self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'')
75        self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'')
76        self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000)
77
78    def test_repr(self):
79        if not sys.platform.startswith('java'):
80            # Test basic sanity of repr()
81            self.assertEqual(repr(u'abc'), "u'abc'")
82            self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
83            self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
84            self.assertEqual(repr(u'\\c'), "u'\\\\c'")
85            self.assertEqual(repr(u'\\'), "u'\\\\'")
86            self.assertEqual(repr(u'\n'), "u'\\n'")
87            self.assertEqual(repr(u'\r'), "u'\\r'")
88            self.assertEqual(repr(u'\t'), "u'\\t'")
89            self.assertEqual(repr(u'\b'), "u'\\x08'")
90            self.assertEqual(repr(u"'\""), """u'\\'"'""")
91            self.assertEqual(repr(u"'\""), """u'\\'"'""")
92            self.assertEqual(repr(u"'"), '''u"'"''')
93            self.assertEqual(repr(u'"'), """u'"'""")
94            latin1repr = (
95                "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
96                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
97                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
98                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
99                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
100                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
101                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
102                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
103                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
104                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
105                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
106                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
107                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
108                "\\xfe\\xff'")
109            testrepr = repr(u''.join(map(unichr, xrange(256))))
110            self.assertEqual(testrepr, latin1repr)
111            # Test repr works on wide unicode escapes without overflow.
112            self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
113                             repr(u"\U00010000" * 39 + u"\uffff" * 4096))
114
115
116    def test_count(self):
117        string_tests.CommonTest.test_count(self)
118        # check mixed argument types
119        self.checkequalnofix(3,  'aaa', 'count', u'a')
120        self.checkequalnofix(0,  'aaa', 'count', u'b')
121        self.checkequalnofix(3, u'aaa', 'count',  'a')
122        self.checkequalnofix(0, u'aaa', 'count',  'b')
123        self.checkequalnofix(0, u'aaa', 'count',  'b')
124        self.checkequalnofix(1, u'aaa', 'count',  'a', -1)
125        self.checkequalnofix(3, u'aaa', 'count',  'a', -10)
126        self.checkequalnofix(2, u'aaa', 'count',  'a', 0, -1)
127        self.checkequalnofix(0, u'aaa', 'count',  'a', 0, -10)
128
129    def test_find(self):
130        self.checkequalnofix(0,  u'abcdefghiabc', 'find', u'abc')
131        self.checkequalnofix(9,  u'abcdefghiabc', 'find', u'abc', 1)
132        self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
133
134        self.assertRaises(TypeError, u'hello'.find)
135        self.assertRaises(TypeError, u'hello'.find, 42)
136
137    def test_rfind(self):
138        string_tests.CommonTest.test_rfind(self)
139        # check mixed argument types
140        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', u'abc')
141        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', u'')
142        self.checkequalnofix(12, u'abcdefghiabc', 'rfind',  '')
143
144    def test_index(self):
145        string_tests.CommonTest.test_index(self)
146        # check mixed argument types
147        for (t1, t2) in ((str, unicode), (unicode, str)):
148            self.checkequalnofix(0, t1('abcdefghiabc'), 'index',  t2(''))
149            self.checkequalnofix(3, t1('abcdefghiabc'), 'index',  t2('def'))
150            self.checkequalnofix(0, t1('abcdefghiabc'), 'index',  t2('abc'))
151            self.checkequalnofix(9, t1('abcdefghiabc'), 'index',  t2('abc'), 1)
152            self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
153            self.assertRaises(ValueError, t1('abcdefghiab').index,  t2('abc'), 1)
154            self.assertRaises(ValueError, t1('abcdefghi').index,  t2('ghi'), 8)
155            self.assertRaises(ValueError, t1('abcdefghi').index,  t2('ghi'), -1)
156
157    def test_rindex(self):
158        string_tests.CommonTest.test_rindex(self)
159        # check mixed argument types
160        for (t1, t2) in ((str, unicode), (unicode, str)):
161            self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex',  t2(''))
162            self.checkequalnofix(3,  t1('abcdefghiabc'), 'rindex',  t2('def'))
163            self.checkequalnofix(9,  t1('abcdefghiabc'), 'rindex',  t2('abc'))
164            self.checkequalnofix(0,  t1('abcdefghiabc'), 'rindex',  t2('abc'), 0, -1)
165
166            self.assertRaises(ValueError, t1('abcdefghiabc').rindex,  t2('hib'))
167            self.assertRaises(ValueError, t1('defghiabc').rindex,  t2('def'), 1)
168            self.assertRaises(ValueError, t1('defghiabc').rindex,  t2('abc'), 0, -1)
169            self.assertRaises(ValueError, t1('abcdefghi').rindex,  t2('ghi'), 0, 8)
170            self.assertRaises(ValueError, t1('abcdefghi').rindex,  t2('ghi'), 0, -1)
171
172    def test_translate(self):
173        self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
174        self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
175        self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
176        self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
177        self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
178        self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
179
180        self.assertRaises(TypeError, u'hello'.translate)
181        self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
182
183    def test_split(self):
184        string_tests.CommonTest.test_split(self)
185
186        # Mixed arguments
187        self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
188        self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
189        self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
190
191    def test_join(self):
192        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
193
194        # mixed arguments
195        self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
196        self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
197        self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
198        self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
199        self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
200        self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
201        self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
202
203    def test_strip(self):
204        string_tests.CommonTest.test_strip(self)
205        self.assertRaises(UnicodeError, u"hello".strip, "\xff")
206
207    def test_replace(self):
208        string_tests.CommonTest.test_replace(self)
209
210        # method call forwarded from str implementation because of unicode argument
211        self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
212        self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
213
214    def test_comparison(self):
215        # Comparisons:
216        self.assertTrue(u'abc' == 'abc')
217        self.assertTrue('abc' == u'abc')
218        self.assertTrue(u'abc' == u'abc')
219        self.assertTrue(u'abcd' > 'abc')
220        self.assertTrue('abcd' > u'abc')
221        self.assertTrue(u'abcd' > u'abc')
222        self.assertTrue(u'abc' < 'abcd')
223        self.assertTrue('abc' < u'abcd')
224        self.assertTrue(u'abc' < u'abcd')
225
226        if 0:
227            # Move these tests to a Unicode collation module test...
228            # Testing UTF-16 code point order comparisons...
229
230            # No surrogates, no fixup required.
231            self.assertTrue(u'\u0061' < u'\u20ac')
232            # Non surrogate below surrogate value, no fixup required
233            self.assertTrue(u'\u0061' < u'\ud800\udc02')
234
235            # Non surrogate above surrogate value, fixup required
236            def test_lecmp(s, s2):
237                self.assertTrue(s < s2)
238
239            def test_fixup(s):
240                s2 = u'\ud800\udc01'
241                test_lecmp(s, s2)
242                s2 = u'\ud900\udc01'
243                test_lecmp(s, s2)
244                s2 = u'\uda00\udc01'
245                test_lecmp(s, s2)
246                s2 = u'\udb00\udc01'
247                test_lecmp(s, s2)
248                s2 = u'\ud800\udd01'
249                test_lecmp(s, s2)
250                s2 = u'\ud900\udd01'
251                test_lecmp(s, s2)
252                s2 = u'\uda00\udd01'
253                test_lecmp(s, s2)
254                s2 = u'\udb00\udd01'
255                test_lecmp(s, s2)
256                s2 = u'\ud800\ude01'
257                test_lecmp(s, s2)
258                s2 = u'\ud900\ude01'
259                test_lecmp(s, s2)
260                s2 = u'\uda00\ude01'
261                test_lecmp(s, s2)
262                s2 = u'\udb00\ude01'
263                test_lecmp(s, s2)
264                s2 = u'\ud800\udfff'
265                test_lecmp(s, s2)
266                s2 = u'\ud900\udfff'
267                test_lecmp(s, s2)
268                s2 = u'\uda00\udfff'
269                test_lecmp(s, s2)
270                s2 = u'\udb00\udfff'
271                test_lecmp(s, s2)
272
273                test_fixup(u'\ue000')
274                test_fixup(u'\uff61')
275
276        # Surrogates on both sides, no fixup required
277        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
278
279    def test_capitalize(self):
280        string_tests.CommonTest.test_capitalize(self)
281        # check that titlecased chars are lowered correctly
282        # \u1ffc is the titlecased char
283        self.checkequal(u'\u1ffc\u1ff3\u1ff3\u1ff3',
284                        u'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
285        # check with cased non-letter chars
286        self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
287                        u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3', 'capitalize')
288        self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
289                        u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd', 'capitalize')
290        self.checkequal(u'\u2160\u2171\u2172',
291                        u'\u2160\u2161\u2162', 'capitalize')
292        self.checkequal(u'\u2160\u2171\u2172',
293                        u'\u2170\u2171\u2172', 'capitalize')
294        # check with Ll chars with no upper - nothing changes here
295        self.checkequal(u'\u019b\u1d00\u1d86\u0221\u1fb7',
296                        u'\u019b\u1d00\u1d86\u0221\u1fb7', 'capitalize')
297
298    def test_islower(self):
299        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
300        self.checkequalnofix(False, u'\u1FFc', 'islower')
301
302    @requires_wide_build
303    def test_islower_non_bmp(self):
304        # non-BMP, uppercase
305        self.assertFalse(u'\U00010401'.islower())
306        self.assertFalse(u'\U00010427'.islower())
307        # non-BMP, lowercase
308        self.assertTrue(u'\U00010429'.islower())
309        self.assertTrue(u'\U0001044E'.islower())
310        # non-BMP, non-cased
311        self.assertFalse(u'\U0001F40D'.islower())
312        self.assertFalse(u'\U0001F46F'.islower())
313
314    def test_isupper(self):
315        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
316        if not sys.platform.startswith('java'):
317            self.checkequalnofix(False, u'\u1FFc', 'isupper')
318
319    @requires_wide_build
320    def test_isupper_non_bmp(self):
321        # non-BMP, uppercase
322        self.assertTrue(u'\U00010401'.isupper())
323        self.assertTrue(u'\U00010427'.isupper())
324        # non-BMP, lowercase
325        self.assertFalse(u'\U00010429'.isupper())
326        self.assertFalse(u'\U0001044E'.isupper())
327        # non-BMP, non-cased
328        self.assertFalse(u'\U0001F40D'.isupper())
329        self.assertFalse(u'\U0001F46F'.isupper())
330
331    def test_istitle(self):
332        string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
333        self.checkequalnofix(True, u'\u1FFc', 'istitle')
334        self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
335
336    @requires_wide_build
337    def test_istitle_non_bmp(self):
338        # non-BMP, uppercase + lowercase
339        self.assertTrue(u'\U00010401\U00010429'.istitle())
340        self.assertTrue(u'\U00010427\U0001044E'.istitle())
341        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
342        for ch in [u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F']:
343            self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
344
345    def test_isspace(self):
346        string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
347        self.checkequalnofix(True, u'\u2000', 'isspace')
348        self.checkequalnofix(True, u'\u200a', 'isspace')
349        self.checkequalnofix(False, u'\u2014', 'isspace')
350
351    @requires_wide_build
352    def test_isspace_non_bmp(self):
353        # apparently there are no non-BMP spaces chars in Unicode 6
354        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
355                   u'\U0001F40D', u'\U0001F46F']:
356            self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch))
357
358    @requires_wide_build
359    def test_isalnum_non_bmp(self):
360        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
361                   u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']:
362            self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
363
364    def test_isalpha(self):
365        string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
366        self.checkequalnofix(True, u'\u1FFc', 'isalpha')
367
368    @requires_wide_build
369    def test_isalpha_non_bmp(self):
370        # non-BMP, cased
371        self.assertTrue(u'\U00010401'.isalpha())
372        self.assertTrue(u'\U00010427'.isalpha())
373        self.assertTrue(u'\U00010429'.isalpha())
374        self.assertTrue(u'\U0001044E'.isalpha())
375        # non-BMP, non-cased
376        self.assertFalse(u'\U0001F40D'.isalpha())
377        self.assertFalse(u'\U0001F46F'.isalpha())
378
379    def test_isdecimal(self):
380        self.checkequalnofix(False, u'', 'isdecimal')
381        self.checkequalnofix(False, u'a', 'isdecimal')
382        self.checkequalnofix(True, u'0', 'isdecimal')
383        self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
384        self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
385        self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
386        self.checkequalnofix(True, u'0123456789', 'isdecimal')
387        self.checkequalnofix(False, u'0123456789a', 'isdecimal')
388
389        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
390
391    @requires_wide_build
392    def test_isdecimal_non_bmp(self):
393        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
394                   u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107']:
395            self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))
396        for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0']:
397            self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
398
399    def test_isdigit(self):
400        string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
401        self.checkequalnofix(True, u'\u2460', 'isdigit')
402        self.checkequalnofix(False, u'\xbc', 'isdigit')
403        self.checkequalnofix(True, u'\u0660', 'isdigit')
404
405    @requires_wide_build
406    def test_isdigit_non_bmp(self):
407        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
408                   u'\U0001F40D', u'\U0001F46F', u'\U00011065']:
409            self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))
410        for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']:
411            self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
412
413    def test_isnumeric(self):
414        self.checkequalnofix(False, u'', 'isnumeric')
415        self.checkequalnofix(False, u'a', 'isnumeric')
416        self.checkequalnofix(True, u'0', 'isnumeric')
417        self.checkequalnofix(True, u'\u2460', 'isnumeric')
418        self.checkequalnofix(True, u'\xbc', 'isnumeric')
419        self.checkequalnofix(True, u'\u0660', 'isnumeric')
420        self.checkequalnofix(True, u'0123456789', 'isnumeric')
421        self.checkequalnofix(False, u'0123456789a', 'isnumeric')
422
423        self.assertRaises(TypeError, u"abc".isnumeric, 42)
424
425    @requires_wide_build
426    def test_isnumeric_non_bmp(self):
427        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
428                   u'\U0001F40D', u'\U0001F46F']:
429            self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch))
430        for ch in [u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
431                   u'\U000104A0', u'\U0001F107']:
432            self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch))
433
434    @requires_wide_build
435    def test_surrogates(self):
436        # this test actually passes on narrow too, but it's just by accident.
437        # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
438        # uppercase as 'X X'
439        for s in (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
440                  u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'):
441            self.assertTrue(s.islower())
442            self.assertFalse(s.isupper())
443            self.assertFalse(s.istitle())
444        for s in (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
445                  u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A'):
446            self.assertFalse(s.islower())
447            self.assertTrue(s.isupper())
448            self.assertTrue(s.istitle())
449
450        for meth_name in ('islower', 'isupper', 'istitle'):
451            meth = getattr(unicode, meth_name)
452            for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF'):
453                self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
454
455        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
456                          'isdecimal', 'isnumeric'):
457            meth = getattr(unicode, meth_name)
458            for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF',
459                      u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
460                      u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'):
461                self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
462
463
464    @requires_wide_build
465    def test_lower(self):
466        string_tests.CommonTest.test_lower(self)
467        self.assertEqual(u'\U00010427'.lower(), u'\U0001044F')
468        self.assertEqual(u'\U00010427\U00010427'.lower(),
469                         u'\U0001044F\U0001044F')
470        self.assertEqual(u'\U00010427\U0001044F'.lower(),
471                         u'\U0001044F\U0001044F')
472        self.assertEqual(u'X\U00010427x\U0001044F'.lower(),
473                         u'x\U0001044Fx\U0001044F')
474
475    @requires_wide_build
476    def test_upper(self):
477        string_tests.CommonTest.test_upper(self)
478        self.assertEqual(u'\U0001044F'.upper(), u'\U00010427')
479        self.assertEqual(u'\U0001044F\U0001044F'.upper(),
480                         u'\U00010427\U00010427')
481        self.assertEqual(u'\U00010427\U0001044F'.upper(),
482                         u'\U00010427\U00010427')
483        self.assertEqual(u'X\U00010427x\U0001044F'.upper(),
484                         u'X\U00010427X\U00010427')
485
486    @requires_wide_build
487    def test_capitalize(self):
488        string_tests.CommonTest.test_capitalize(self)
489        self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427')
490        self.assertEqual(u'\U0001044F\U0001044F'.capitalize(),
491                         u'\U00010427\U0001044F')
492        self.assertEqual(u'\U00010427\U0001044F'.capitalize(),
493                         u'\U00010427\U0001044F')
494        self.assertEqual(u'\U0001044F\U00010427'.capitalize(),
495                         u'\U00010427\U0001044F')
496        self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(),
497                         u'X\U0001044Fx\U0001044F')
498
499    @requires_wide_build
500    def test_title(self):
501        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
502        self.assertEqual(u'\U0001044F'.title(), u'\U00010427')
503        self.assertEqual(u'\U0001044F\U0001044F'.title(),
504                         u'\U00010427\U0001044F')
505        self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
506                         u'\U00010427\U0001044F \U00010427\U0001044F')
507        self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(),
508                         u'\U00010427\U0001044F \U00010427\U0001044F')
509        self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(),
510                         u'\U00010427\U0001044F \U00010427\U0001044F')
511        self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
512                         u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
513
514    @requires_wide_build
515    def test_swapcase(self):
516        string_tests.CommonTest.test_swapcase(self)
517        self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427')
518        self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F')
519        self.assertEqual(u'\U0001044F\U0001044F'.swapcase(),
520                         u'\U00010427\U00010427')
521        self.assertEqual(u'\U00010427\U0001044F'.swapcase(),
522                         u'\U0001044F\U00010427')
523        self.assertEqual(u'\U0001044F\U00010427'.swapcase(),
524                         u'\U00010427\U0001044F')
525        self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(),
526                         u'x\U0001044FX\U00010427')
527
528    def test_contains(self):
529        # Testing Unicode contains method
530        self.assertIn('a', u'abdb')
531        self.assertIn('a', u'bdab')
532        self.assertIn('a', u'bdaba')
533        self.assertIn('a', u'bdba')
534        self.assertIn('a', u'bdba')
535        self.assertIn(u'a', u'bdba')
536        self.assertNotIn(u'a', u'bdb')
537        self.assertNotIn(u'a', 'bdb')
538        self.assertIn(u'a', 'bdba')
539        self.assertIn(u'a', ('a',1,None))
540        self.assertIn(u'a', (1,None,'a'))
541        self.assertIn(u'a', (1,None,u'a'))
542        self.assertIn('a', ('a',1,None))
543        self.assertIn('a', (1,None,'a'))
544        self.assertIn('a', (1,None,u'a'))
545        self.assertNotIn('a', ('x',1,u'y'))
546        self.assertNotIn('a', ('x',1,None))
547        self.assertNotIn(u'abcd', u'abcxxxx')
548        self.assertIn(u'ab', u'abcd')
549        self.assertIn('ab', u'abc')
550        self.assertIn(u'ab', 'abc')
551        self.assertIn(u'ab', (1,None,u'ab'))
552        self.assertIn(u'', u'abc')
553        self.assertIn('', u'abc')
554
555        # If the following fails either
556        # the contains operator does not propagate UnicodeErrors or
557        # someone has changed the default encoding
558        self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
559        self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')
560
561        self.assertIn(u'', '')
562        self.assertIn('', u'')
563        self.assertIn(u'', u'')
564        self.assertIn(u'', 'abc')
565        self.assertIn('', u'abc')
566        self.assertIn(u'', u'abc')
567        self.assertNotIn(u'\0', 'abc')
568        self.assertNotIn('\0', u'abc')
569        self.assertNotIn(u'\0', u'abc')
570        self.assertIn(u'\0', '\0abc')
571        self.assertIn('\0', u'\0abc')
572        self.assertIn(u'\0', u'\0abc')
573        self.assertIn(u'\0', 'abc\0')
574        self.assertIn('\0', u'abc\0')
575        self.assertIn(u'\0', u'abc\0')
576        self.assertIn(u'a', '\0abc')
577        self.assertIn('a', u'\0abc')
578        self.assertIn(u'a', u'\0abc')
579        self.assertIn(u'asdf', 'asdf')
580        self.assertIn('asdf', u'asdf')
581        self.assertIn(u'asdf', u'asdf')
582        self.assertNotIn(u'asdf', 'asd')
583        self.assertNotIn('asdf', u'asd')
584        self.assertNotIn(u'asdf', u'asd')
585        self.assertNotIn(u'asdf', '')
586        self.assertNotIn('asdf', u'')
587        self.assertNotIn(u'asdf', u'')
588
589        self.assertRaises(TypeError, u"abc".__contains__)
590        self.assertRaises(TypeError, u"abc".__contains__, object())
591
592    def test_formatting(self):
593        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
594        # Testing Unicode formatting strings...
595        self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
596        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
597        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
598        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
599        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
600        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
601        if not sys.platform.startswith('java'):
602            self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
603        self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
604        self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
605
606        self.assertEqual(u'%c' % 0x1234, u'\u1234')
607        self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
608        self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))
609
610        for num in range(0x00,0x80):
611            char = chr(num)
612            self.assertEqual(u"%c" % char, unicode(char))
613            self.assertEqual(u"%c" % num, unicode(char))
614            self.assertTrue(char == u"%c" % char)
615            self.assertTrue(char == u"%c" % num)
616        # Issue 7649
617        for num in range(0x80,0x100):
618            uchar = unichr(num)
619            self.assertEqual(uchar, u"%c" % num)   # works only with ints
620            self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
621            # the implicit decoding should fail for non-ascii chars
622            self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
623            self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))
624
625        # formatting jobs delegated from the string implementation:
626        self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
627        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
628        self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
629        self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
630        self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123},  u'...abc...')
631        self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
632        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
633        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
634        self.assertEqual('...%s...' % u"abc", u'...abc...')
635        self.assertEqual('%*s' % (5,u'abc',), u'  abc')
636        self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
637        self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
638        self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
639        self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
640        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
641        self.assertEqual('%c' % u'a', u'a')
642        class Wrapper:
643            def __str__(self):
644                return u'\u1234'
645        self.assertEqual('%s' % Wrapper(), u'\u1234')
646
647    @test_support.cpython_only
648    def test_formatting_huge_precision(self):
649        from _testcapi import INT_MAX
650        format_string = u"%.{}f".format(INT_MAX + 1)
651        with self.assertRaises(ValueError):
652            result = format_string % 2.34
653
654    def test_formatting_huge_width(self):
655        format_string = u"%{}f".format(sys.maxsize + 1)
656        with self.assertRaises(ValueError):
657            result = format_string % 2.34
658
659    def test_startswith_endswith_errors(self):
660        for meth in (u'foo'.startswith, u'foo'.endswith):
661            with self.assertRaises(UnicodeDecodeError):
662                meth('\xff')
663            with self.assertRaises(TypeError) as cm:
664                meth(['f'])
665            exc = str(cm.exception)
666            self.assertIn('unicode', exc)
667            self.assertIn('str', exc)
668            self.assertIn('tuple', exc)
669
670    @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
671    def test_format_float(self):
672        # should not format with a comma, but always with C locale
673        self.assertEqual(u'1.0', u'%.1f' % 1.0)
674
675    def test_constructor(self):
676        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
677
678        self.assertEqual(
679            unicode(u'unicode remains unicode'),
680            u'unicode remains unicode'
681        )
682
683        class UnicodeSubclass(unicode):
684            pass
685
686        self.assertEqual(
687            unicode(UnicodeSubclass('unicode subclass becomes unicode')),
688            u'unicode subclass becomes unicode'
689        )
690
691        self.assertEqual(
692            unicode('strings are converted to unicode'),
693            u'strings are converted to unicode'
694        )
695
696        class UnicodeCompat:
697            def __init__(self, x):
698                self.x = x
699            def __unicode__(self):
700                return self.x
701
702        self.assertEqual(
703            unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
704            u'__unicode__ compatible objects are recognized')
705
706        class StringCompat:
707            def __init__(self, x):
708                self.x = x
709            def __str__(self):
710                return self.x
711
712        self.assertEqual(
713            unicode(StringCompat('__str__ compatible objects are recognized')),
714            u'__str__ compatible objects are recognized'
715        )
716
717        # unicode(obj) is compatible to str():
718
719        o = StringCompat('unicode(obj) is compatible to str()')
720        self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
721        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
722
723        # %-formatting and .__unicode__()
724        self.assertEqual(u'%s' %
725                         UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
726                         u"u'%s' % obj uses obj.__unicode__()")
727        self.assertEqual(u'%s' %
728                         UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
729                         u"u'%s' % obj falls back to obj.__str__()")
730
731        for obj in (123, 123.45, 123L):
732            self.assertEqual(unicode(obj), unicode(str(obj)))
733
734        # unicode(obj, encoding, error) tests (this maps to
735        # PyUnicode_FromEncodedObject() at C level)
736
737        if not sys.platform.startswith('java'):
738            self.assertRaises(
739                TypeError,
740                unicode,
741                u'decoding unicode is not supported',
742                'utf-8',
743                'strict'
744            )
745
746        self.assertEqual(
747            unicode('strings are decoded to unicode', 'utf-8', 'strict'),
748            u'strings are decoded to unicode'
749        )
750
751        if not sys.platform.startswith('java'):
752            with test_support.check_py3k_warnings():
753                buf = buffer('character buffers are decoded to unicode')
754            self.assertEqual(
755                unicode(
756                    buf,
757                    'utf-8',
758                    'strict'
759                ),
760                u'character buffers are decoded to unicode'
761            )
762
763        self.assertRaises(TypeError, unicode, 42, 42, 42)
764
765    def test_codecs_utf7(self):
766        utfTests = [
767            (u'A\u2262\u0391.', 'A+ImIDkQ.'),             # RFC2152 example
768            (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
769            (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
770            (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
771            (u'+', '+-'),
772            (u'+-', '+--'),
773            (u'+?', '+-?'),
774            (u'\?', '+AFw?'),
775            (u'+?', '+-?'),
776            (ur'\\?', '+AFwAXA?'),
777            (ur'\\\?', '+AFwAXABc?'),
778            (ur'++--', '+-+---'),
779            (u'\U000abcde', '+2m/c3g-'),                  # surrogate pairs
780            (u'/', '/'),
781        ]
782
783        for (x, y) in utfTests:
784            self.assertEqual(x.encode('utf-7'), y)
785
786        # Unpaired surrogates are passed through
787        self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
788        self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
789        self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
790        self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
791        self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
792        self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
793        self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
794        self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
795
796        self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
797        self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
798
799        # Direct encoded characters
800        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
801        # Optional direct characters
802        set_o = '!"#$%&*;<=>@[]^_`{|}'
803        for c in set_d:
804            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
805            self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
806            self.assertTrue(c == c.encode('ascii').decode('utf7'))
807        for c in set_o:
808            self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
809            self.assertTrue(c == c.encode('ascii').decode('utf7'))
810
811    def test_codecs_utf8(self):
812        self.assertEqual(u''.encode('utf-8'), '')
813        self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
814        self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
815        self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
816        self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
817        self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
818        self.assertEqual(
819            (u'\ud800\udc02'*1000).encode('utf-8'),
820            '\xf0\x90\x80\x82'*1000
821        )
822        self.assertEqual(
823            u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
824            u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
825            u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
826            u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
827            u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
828            u' Nunstuck git und'.encode('utf-8'),
829            '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
830            '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
831            '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
832            '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
833            '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
834            '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
835            '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
836            '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
837            '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
838            '\xe3\x80\x8cWenn ist das Nunstuck git und'
839        )
840
841        # UTF-8 specific decoding tests
842        self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456')
843        self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002')
844        self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac')
845
846        # Other possible utf-8 test cases:
847        # * strict decoding testing for all of the
848        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
849
850    def test_utf8_decode_valid_sequences(self):
851        sequences = [
852            # single byte
853            ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
854            # 2 bytes
855            ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
856            # 3 bytes
857            ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
858            ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
859            # 4 bytes
860            ('\xF0\x90\x80\x80', u'\U00010000'),
861            ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
862        ]
863        for seq, res in sequences:
864            self.assertEqual(seq.decode('utf-8'), res)
865
866        for ch in map(unichr, range(0, sys.maxunicode)):
867            self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
868
869    def test_utf8_decode_invalid_sequences(self):
870        # continuation bytes in a sequence of 2, 3, or 4 bytes
871        continuation_bytes = map(chr, range(0x80, 0xC0))
872        # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
873        invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
874        # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
875        invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
876        invalid_start_bytes = (
877            continuation_bytes + invalid_2B_seq_start_bytes +
878            invalid_4B_seq_start_bytes + map(chr, range(0xF7, 0x100))
879        )
880
881        for byte in invalid_start_bytes:
882            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
883
884        for sb in invalid_2B_seq_start_bytes:
885            for cb in continuation_bytes:
886                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
887
888        for sb in invalid_4B_seq_start_bytes:
889            for cb1 in continuation_bytes[:3]:
890                for cb3 in continuation_bytes[:3]:
891                    self.assertRaises(UnicodeDecodeError,
892                                      (sb+cb1+'\x80'+cb3).decode, 'utf-8')
893
894        for cb in map(chr, range(0x80, 0xA0)):
895            self.assertRaises(UnicodeDecodeError,
896                              ('\xE0'+cb+'\x80').decode, 'utf-8')
897            self.assertRaises(UnicodeDecodeError,
898                              ('\xE0'+cb+'\xBF').decode, 'utf-8')
899        # XXX: surrogates shouldn't be valid UTF-8!
900        # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
901        # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
902        #for cb in map(chr, range(0xA0, 0xC0)):
903            #self.assertRaises(UnicodeDecodeError,
904                              #('\xED'+cb+'\x80').decode, 'utf-8')
905            #self.assertRaises(UnicodeDecodeError,
906                              #('\xED'+cb+'\xBF').decode, 'utf-8')
907        # but since they are valid on Python 2 add a test for that:
908        for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
909                                 map(unichr, range(0xd800, 0xe000, 64))):
910            encoded = '\xED'+cb+'\x80'
911            self.assertEqual(encoded.decode('utf-8'), surrogate)
912            self.assertEqual(surrogate.encode('utf-8'), encoded)
913
914        for cb in map(chr, range(0x80, 0x90)):
915            self.assertRaises(UnicodeDecodeError,
916                              ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
917            self.assertRaises(UnicodeDecodeError,
918                              ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
919        for cb in map(chr, range(0x90, 0xC0)):
920            self.assertRaises(UnicodeDecodeError,
921                              ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
922            self.assertRaises(UnicodeDecodeError,
923                              ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
924
925    def test_issue8271(self):
926        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
927        # only the start byte and the continuation byte(s) are now considered
928        # invalid, instead of the number of bytes specified by the start byte.
929        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
930        # table 3-8, Row 2) for more information about the algorithm used.
931        FFFD = u'\ufffd'
932        sequences = [
933            # invalid start bytes
934            ('\x80', FFFD), # continuation byte
935            ('\x80\x80', FFFD*2), # 2 continuation bytes
936            ('\xc0', FFFD),
937            ('\xc0\xc0', FFFD*2),
938            ('\xc1', FFFD),
939            ('\xc1\xc0', FFFD*2),
940            ('\xc0\xc1', FFFD*2),
941            # with start byte of a 2-byte sequence
942            ('\xc2', FFFD), # only the start byte
943            ('\xc2\xc2', FFFD*2), # 2 start bytes
944            ('\xc2\xc2\xc2', FFFD*3), # 2 start bytes
945            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
946            # with start byte of a 3-byte sequence
947            ('\xe1', FFFD), # only the start byte
948            ('\xe1\xe1', FFFD*2), # 2 start bytes
949            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
950            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
951            ('\xe1\x80', FFFD), # only 1 continuation byte
952            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
953            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
954            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
955            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
956            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
957            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
958            # with start byte of a 4-byte sequence
959            ('\xf1', FFFD), # only the start byte
960            ('\xf1\xf1', FFFD*2), # 2 start bytes
961            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
962            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
963            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
964            ('\xf1\x80', FFFD), # only 1 continuation bytes
965            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
966            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
967            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
968            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
969            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
970            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
971            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
972            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
973            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
974            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
975            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
976            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
977            # with invalid start byte of a 4-byte sequence (rfc2279)
978            ('\xf5', FFFD), # only the start byte
979            ('\xf5\xf5', FFFD*2), # 2 start bytes
980            ('\xf5\x80', FFFD*2), # only 1 continuation byte
981            ('\xf5\x80\x80', FFFD*3), # only 2 continuation byte
982            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
983            ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
984            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
985            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
986            # with invalid start byte of a 5-byte sequence (rfc2279)
987            ('\xf8', FFFD), # only the start byte
988            ('\xf8\xf8', FFFD*2), # 2 start bytes
989            ('\xf8\x80', FFFD*2), # only one continuation byte
990            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
991            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
992            # with invalid start byte of a 6-byte sequence (rfc2279)
993            ('\xfc', FFFD), # only the start byte
994            ('\xfc\xfc', FFFD*2), # 2 start bytes
995            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
996            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
997            # invalid start byte
998            ('\xfe', FFFD),
999            ('\xfe\x80\x80', FFFD*3),
1000            # other sequences
1001            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
1002            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
1003            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
1004            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1005             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1006        ]
1007        for n, (seq, res) in enumerate(sequences):
1008            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1009            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1010            self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
1011            self.assertEqual(seq.decode('utf-8', 'ignore'),
1012                             res.replace(u'\uFFFD', ''))
1013
1014    def test_codecs_idna(self):
1015        # Test whether trailing dot is preserved
1016        self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
1017
1018    def test_codecs_errors(self):
1019        # Error handling (encoding)
1020        self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
1021        self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
1022        self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
1023        self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
1024        self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'),
1025                         u'Andr\202 x'.encode('ascii', errors='replace'))
1026        self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'),
1027                         u'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
1028
1029        # Error handling (decoding)
1030        self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
1031        self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
1032        self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
1033        self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
1034        self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
1035                         u'abcde'.decode('ascii', errors='ignore'))
1036        self.assertEqual(u'abcde'.decode('ascii', 'replace'),
1037                         u'abcde'.decode(encoding='ascii', errors='replace'))
1038
1039        # Error handling (unknown character names)
1040        self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
1041
1042        # Error handling (truncated escape sequence)
1043        self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
1044
1045        self.assertRaises(TypeError, "hello".decode, "test.unicode1")
1046        self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
1047        self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
1048        self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
1049        # executes PyUnicode_Encode()
1050        import imp
1051        self.assertRaises(
1052            ImportError,
1053            imp.find_module,
1054            "non-existing module",
1055            [u"non-existing dir"]
1056        )
1057
1058        # Error handling (wrong arguments)
1059        self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
1060
1061        # Error handling (PyUnicode_EncodeDecimal())
1062        self.assertRaises(UnicodeError, int, u"\u0200")
1063
1064    def test_codecs(self):
1065        # Encoding
1066        self.assertEqual(u'hello'.encode('ascii'), 'hello')
1067        self.assertEqual(u'hello'.encode('utf-7'), 'hello')
1068        self.assertEqual(u'hello'.encode('utf-8'), 'hello')
1069        self.assertEqual(u'hello'.encode('utf8'), 'hello')
1070        self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
1071        self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
1072        self.assertEqual(u'hello'.encode('latin-1'), 'hello')
1073
1074        # Roundtrip safety for BMP (just the first 1024 chars)
1075        for c in xrange(1024):
1076            u = unichr(c)
1077            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1078                             'utf-16-be', 'raw_unicode_escape',
1079                             'unicode_escape', 'unicode_internal'):
1080                self.assertEqual(unicode(u.encode(encoding),encoding), u)
1081
1082        # Roundtrip safety for BMP (just the first 256 chars)
1083        for c in xrange(256):
1084            u = unichr(c)
1085            for encoding in ('latin-1',):
1086                self.assertEqual(unicode(u.encode(encoding),encoding), u)
1087
1088        # Roundtrip safety for BMP (just the first 128 chars)
1089        for c in xrange(128):
1090            u = unichr(c)
1091            for encoding in ('ascii',):
1092                self.assertEqual(unicode(u.encode(encoding),encoding), u)
1093
1094        # Roundtrip safety for non-BMP (just a few chars)
1095        u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
1096        for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1097                         #'raw_unicode_escape',
1098                         'unicode_escape', 'unicode_internal'):
1099            self.assertEqual(unicode(u.encode(encoding),encoding), u)
1100
1101        # UTF-8 must be roundtrip safe for all UCS-2 code points
1102        # This excludes surrogates: in the full range, there would be
1103        # a surrogate pair (\udbff\udc00), which gets converted back
1104        # to a non-BMP character (\U0010fc00)
1105        u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
1106        for encoding in ('utf-8',):
1107            self.assertEqual(unicode(u.encode(encoding),encoding), u)
1108
1109    def test_codecs_charmap(self):
1110        # 0-127
1111        s = ''.join(map(chr, xrange(128)))
1112        for encoding in (
1113            'cp037', 'cp1026',
1114            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1115            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1116            'cp863', 'cp865', 'cp866',
1117            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1118            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1119            'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1120            'mac_cyrillic', 'mac_latin2',
1121
1122            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1123            'cp1256', 'cp1257', 'cp1258',
1124            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1125
1126            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1127            'cp1006', 'iso8859_8',
1128
1129            ### These have undefined mappings:
1130            #'cp424',
1131
1132            ### These fail the round-trip:
1133            #'cp875'
1134
1135            ):
1136            self.assertEqual(unicode(s, encoding).encode(encoding), s)
1137
1138        # 128-255
1139        s = ''.join(map(chr, xrange(128, 256)))
1140        for encoding in (
1141            'cp037', 'cp1026',
1142            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1143            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1144            'cp863', 'cp865', 'cp866',
1145            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1146            'iso8859_2', 'iso8859_4', 'iso8859_5',
1147            'iso8859_9', 'koi8_r', 'latin_1',
1148            'mac_cyrillic', 'mac_latin2',
1149
1150            ### These have undefined mappings:
1151            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1152            #'cp1256', 'cp1257', 'cp1258',
1153            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1154            #'iso8859_3', 'iso8859_6', 'iso8859_7',
1155            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1156
1157            ### These fail the round-trip:
1158            #'cp1006', 'cp875', 'iso8859_8',
1159
1160            ):
1161            self.assertEqual(unicode(s, encoding).encode(encoding), s)
1162
1163    def test_concatenation(self):
1164        self.assertEqual((u"abc" u"def"), u"abcdef")
1165        self.assertEqual(("abc" u"def"), u"abcdef")
1166        self.assertEqual((u"abc" "def"), u"abcdef")
1167        self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
1168        self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
1169
    def test_printing(self):
        # Smoke test for the print statement with unicode arguments: nothing
        # is asserted, the statements below merely have to run without
        # raising.

        # File-like object whose write() discards whatever it is given
        class BitBucket:
            def write(self, text):
                pass

        out = BitBucket()
        # unicode only, and unicode mixed with byte strings
        print >>out, u'abc'
        print >>out, u'abc', u'def'
        print >>out, u'abc', 'def'
        print >>out, 'abc', u'def'
        # text that already ends in a newline, with and without the
        # trailing comma.
        # NOTE(review): the repeated statements look intentional (presumably
        # they exercise the state the print statement keeps on the file
        # between calls) -- confirm before removing any of them.
        print >>out, u'abc\n'
        print >>out, u'abc\n',
        print >>out, u'abc\n',
        print >>out, u'def\n'
        print >>out, u'def\n'
1185
1186    def test_ucs4(self):
1187        x = u'\U00100000'
1188        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1189        self.assertEqual(x, y)
1190
1191        y = r'\U00100000'
1192        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1193        self.assertEqual(x, y)
1194        y = r'\U00010000'
1195        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1196        self.assertEqual(x, y)
1197
1198        try:
1199            '\U11111111'.decode("raw-unicode-escape")
1200        except UnicodeDecodeError as e:
1201            self.assertEqual(e.start, 0)
1202            self.assertEqual(e.end, 10)
1203        else:
1204            self.fail("Should have raised UnicodeDecodeError")
1205
1206    def test_conversion(self):
1207        # Make sure __unicode__() works properly
1208        class Foo0:
1209            def __str__(self):
1210                return "foo"
1211
1212        class Foo1:
1213            def __unicode__(self):
1214                return u"foo"
1215
1216        class Foo2(object):
1217            def __unicode__(self):
1218                return u"foo"
1219
1220        class Foo3(object):
1221            def __unicode__(self):
1222                return "foo"
1223
1224        class Foo4(str):
1225            def __unicode__(self):
1226                return "foo"
1227
1228        class Foo5(unicode):
1229            def __unicode__(self):
1230                return "foo"
1231
1232        class Foo6(str):
1233            def __str__(self):
1234                return "foos"
1235
1236            def __unicode__(self):
1237                return u"foou"
1238
1239        class Foo7(unicode):
1240            def __str__(self):
1241                return "foos"
1242            def __unicode__(self):
1243                return u"foou"
1244
1245        class Foo8(unicode):
1246            def __new__(cls, content=""):
1247                return unicode.__new__(cls, 2*content)
1248            def __unicode__(self):
1249                return self
1250
1251        class Foo9(unicode):
1252            def __str__(self):
1253                return "string"
1254            def __unicode__(self):
1255                return "not unicode"
1256
1257        self.assertEqual(unicode(Foo0()), u"foo")
1258        self.assertEqual(unicode(Foo1()), u"foo")
1259        self.assertEqual(unicode(Foo2()), u"foo")
1260        self.assertEqual(unicode(Foo3()), u"foo")
1261        self.assertEqual(unicode(Foo4("bar")), u"foo")
1262        self.assertEqual(unicode(Foo5("bar")), u"foo")
1263        self.assertEqual(unicode(Foo6("bar")), u"foou")
1264        self.assertEqual(unicode(Foo7("bar")), u"foou")
1265        self.assertEqual(unicode(Foo8("foo")), u"foofoo")
1266        self.assertEqual(str(Foo9("foo")), "string")
1267        self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1268
1269    def test_unicode_repr(self):
1270        class s1:
1271            def __repr__(self):
1272                return '\\n'
1273
1274        class s2:
1275            def __repr__(self):
1276                return u'\\n'
1277
1278        self.assertEqual(repr(s1()), '\\n')
1279        self.assertEqual(repr(s2()), '\\n')
1280
1281    def test_expandtabs_overflows_gracefully(self):
1282        # This test only affects 32-bit platforms because expandtabs can only take
1283        # an int as the max value, not a 64-bit C long.  If expandtabs is changed
1284        # to take a 64-bit long, this test should apply to all platforms.
1285        if sys.maxint > (1 << 32) or struct.calcsize('P') != 4:
1286            return
1287        self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
1288
1289    def test__format__(self):
1290        def test(value, format, expected):
1291            # test both with and without the trailing 's'
1292            self.assertEqual(value.__format__(format), expected)
1293            self.assertEqual(value.__format__(format + u's'), expected)
1294
1295        test(u'', u'', u'')
1296        test(u'abc', u'', u'abc')
1297        test(u'abc', u'.3', u'abc')
1298        test(u'ab', u'.3', u'ab')
1299        test(u'abcdef', u'.3', u'abc')
1300        test(u'abcdef', u'.0', u'')
1301        test(u'abc', u'3.3', u'abc')
1302        test(u'abc', u'2.3', u'abc')
1303        test(u'abc', u'2.2', u'ab')
1304        test(u'abc', u'3.2', u'ab ')
1305        test(u'result', u'x<0', u'result')
1306        test(u'result', u'x<5', u'result')
1307        test(u'result', u'x<6', u'result')
1308        test(u'result', u'x<7', u'resultx')
1309        test(u'result', u'x<8', u'resultxx')
1310        test(u'result', u' <7', u'result ')
1311        test(u'result', u'<7', u'result ')
1312        test(u'result', u'>7', u' result')
1313        test(u'result', u'>8', u'  result')
1314        test(u'result', u'^8', u' result ')
1315        test(u'result', u'^9', u' result  ')
1316        test(u'result', u'^10', u'  result  ')
1317        test(u'a', u'10000', u'a' + u' ' * 9999)
1318        test(u'', u'10000', u' ' * 10000)
1319        test(u'', u'10000000', u' ' * 10000000)
1320
1321        # test mixing unicode and str
1322        self.assertEqual(u'abc'.__format__('s'), u'abc')
1323        self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1324
1325    def test_format(self):
1326        self.assertEqual(u''.format(), u'')
1327        self.assertEqual(u'a'.format(), u'a')
1328        self.assertEqual(u'ab'.format(), u'ab')
1329        self.assertEqual(u'a{{'.format(), u'a{')
1330        self.assertEqual(u'a}}'.format(), u'a}')
1331        self.assertEqual(u'{{b'.format(), u'{b')
1332        self.assertEqual(u'}}b'.format(), u'}b')
1333        self.assertEqual(u'a{{b'.format(), u'a{b')
1334
1335        # examples from the PEP:
1336        import datetime
1337        self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
1338        self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
1339                         u"My name is Fred")
1340        self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
1341                         u"My name is Fred :-{}")
1342
1343        # datetime.__format__ doesn't work with unicode
1344        #d = datetime.date(2007, 8, 18)
1345        #self.assertEqual("The year is {0.year}".format(d),
1346        #                 "The year is 2007")
1347
1348        # classes we'll use for testing
1349        class C:
1350            def __init__(self, x=100):
1351                self._x = x
1352            def __format__(self, spec):
1353                return spec
1354
1355        class D:
1356            def __init__(self, x):
1357                self.x = x
1358            def __format__(self, spec):
1359                return str(self.x)
1360
1361        # class with __str__, but no __format__
1362        class E:
1363            def __init__(self, x):
1364                self.x = x
1365            def __str__(self):
1366                return u'E(' + self.x + u')'
1367
1368        # class with __repr__, but no __format__ or __str__
1369        class F:
1370            def __init__(self, x):
1371                self.x = x
1372            def __repr__(self):
1373                return u'F(' + self.x + u')'
1374
1375        # class with __format__ that forwards to string, for some format_spec's
1376        class G:
1377            def __init__(self, x):
1378                self.x = x
1379            def __str__(self):
1380                return u"string is " + self.x
1381            def __format__(self, format_spec):
1382                if format_spec == 'd':
1383                    return u'G(' + self.x + u')'
1384                return object.__format__(self, format_spec)
1385
1386        # class that returns a bad type from __format__
1387        class H:
1388            def __format__(self, format_spec):
1389                return 1.0
1390
1391        class I(datetime.date):
1392            def __format__(self, format_spec):
1393                return self.strftime(format_spec)
1394
1395        class J(int):
1396            def __format__(self, format_spec):
1397                return int.__format__(self * 2, format_spec)
1398
1399
1400        self.assertEqual(u''.format(), u'')
1401        self.assertEqual(u'abc'.format(), u'abc')
1402        self.assertEqual(u'{0}'.format(u'abc'), u'abc')
1403        self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
1404        self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
1405        self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
1406        self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
1407        self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
1408        self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
1409        self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
1410        self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
1411        self.assertEqual(u'{0}'.format(-15), u'-15')
1412        self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
1413        self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
1414        self.assertEqual(u'{{'.format(), u'{')
1415        self.assertEqual(u'}}'.format(), u'}')
1416        self.assertEqual(u'{{}}'.format(), u'{}')
1417        self.assertEqual(u'{{x}}'.format(), u'{x}')
1418        self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
1419        self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
1420        self.assertEqual(u'}}{{'.format(), u'}{')
1421        self.assertEqual(u'}}x{{'.format(), u'}x{')
1422
1423        # weird field names
1424        self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
1425        self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
1426        self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')
1427
1428        self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
1429        self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
1430        self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
1431        self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
1432        self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
1433        self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
1434        self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')
1435
1436        # strings
1437        self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
1438        self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
1439        self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
1440        self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
1441        self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
1442        self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
1443        self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
1444        self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
1445        self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
1446        self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
1447        self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
1448        self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
1449        self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
1450        self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
1451        self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
1452        self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
1453        self.assertEqual(u'{0:>8s}'.format(u'result'), u'  result')
1454        self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
1455        self.assertEqual(u'{0:^9s}'.format(u'result'), u' result  ')
1456        self.assertEqual(u'{0:^10s}'.format(u'result'), u'  result  ')
1457        self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
1458        self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
1459        self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)
1460
1461        # format specifiers for user defined type
1462        self.assertEqual(u'{0:abc}'.format(C()), u'abc')
1463
1464        # !r and !s coercions
1465        self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
1466        self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
1467        self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello          ')
1468        self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello          ')
1469        self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
1470        self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
1471        self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')
1472
1473        # test fallback to object.__format__
1474        self.assertEqual(u'{0}'.format({}), u'{}')
1475        self.assertEqual(u'{0}'.format([]), u'[]')
1476        self.assertEqual(u'{0}'.format([1]), u'[1]')
1477        self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
1478        self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
1479        self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')
1480
1481        msg = 'object.__format__ with a non-empty format string is deprecated'
1482        with test_support.check_warnings((msg, PendingDeprecationWarning)):
1483            self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data)  ')
1484            self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data)  ')
1485            self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')
1486
1487        self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
1488                                                        month=8,
1489                                                        day=27)),
1490                         u"date: 2007-08-27")
1491
1492        # test deriving from a builtin type and overriding __format__
1493        self.assertEqual(u"{0}".format(J(10)), u"20")
1494
1495
1496        # string format specifiers
1497        self.assertEqual(u'{0:}'.format('a'), u'a')
1498
1499        # computed format specifiers
1500        self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
1501        self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
1502        self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
1503        self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5), u'hello     ')
1504        self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), u'hello     ')
1505
1506        # test various errors
1507        self.assertRaises(ValueError, u'{'.format)
1508        self.assertRaises(ValueError, u'}'.format)
1509        self.assertRaises(ValueError, u'a{'.format)
1510        self.assertRaises(ValueError, u'a}'.format)
1511        self.assertRaises(ValueError, u'{a'.format)
1512        self.assertRaises(ValueError, u'}a'.format)
1513        self.assertRaises(IndexError, u'{0}'.format)
1514        self.assertRaises(IndexError, u'{1}'.format, u'abc')
1515        self.assertRaises(KeyError,   u'{x}'.format)
1516        self.assertRaises(ValueError, u"}{".format)
1517        self.assertRaises(ValueError, u"{".format)
1518        self.assertRaises(ValueError, u"}".format)
1519        self.assertRaises(ValueError, u"abc{0:{}".format)
1520        self.assertRaises(ValueError, u"{0".format)
1521        self.assertRaises(IndexError, u"{0.}".format)
1522        self.assertRaises(ValueError, u"{0.}".format, 0)
1523        self.assertRaises(IndexError, u"{0[}".format)
1524        self.assertRaises(ValueError, u"{0[}".format, [])
1525        self.assertRaises(KeyError,   u"{0]}".format)
1526        self.assertRaises(ValueError, u"{0.[]}".format, 0)
1527        self.assertRaises(ValueError, u"{0..foo}".format, 0)
1528        self.assertRaises(ValueError, u"{0[0}".format, 0)
1529        self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
1530        self.assertRaises(KeyError,   u"{c]}".format)
1531        self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
1532        self.assertRaises(ValueError, u"{0}}".format, 0)
1533        self.assertRaises(KeyError,   u"{foo}".format, bar=3)
1534        self.assertRaises(ValueError, u"{0!x}".format, 3)
1535        self.assertRaises(ValueError, u"{0!}".format, 0)
1536        self.assertRaises(ValueError, u"{0!rs}".format, 0)
1537        self.assertRaises(ValueError, u"{!}".format)
1538        self.assertRaises(IndexError, u"{:}".format)
1539        self.assertRaises(IndexError, u"{:s}".format)
1540        self.assertRaises(IndexError, u"{}".format)
1541        big = u"23098475029384702983476098230754973209482573"
1542        self.assertRaises(ValueError, (u"{" + big + u"}").format)
1543        self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])
1544
1545        # issue 6089
1546        self.assertRaises(ValueError, u"{0[0]x}".format, [None])
1547        self.assertRaises(ValueError, u"{0[0](10)}".format, [None])
1548
1549        # can't have a replacement on the field name portion
1550        self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)
1551
1552        # exceed maximum recursion depth
1553        self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
1554        self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1555                          0, 1, 2, 3, 4, 5, 6, 7)
1556
1557        # string format spec errors
1558        self.assertRaises(ValueError, u"{0:-s}".format, u'')
1559        self.assertRaises(ValueError, format, u"", u"-")
1560        self.assertRaises(ValueError, u"{0:=s}".format, u'')
1561
1562        # test combining string and unicode
1563        self.assertEqual(u"foo{0}".format('bar'), u'foobar')
1564        # This will try to convert the argument from unicode to str, which
1565        #  will succeed
1566        self.assertEqual("foo{0}".format(u'bar'), 'foobar')
1567        # This will try to convert the argument from unicode to str, which
1568        #  will fail
1569        self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1570
1571    def test_format_huge_precision(self):
1572        format_string = u".{}f".format(sys.maxsize + 1)
1573        with self.assertRaises(ValueError):
1574            result = format(2.34, format_string)
1575
1576    def test_format_huge_width(self):
1577        format_string = u"{}f".format(sys.maxsize + 1)
1578        with self.assertRaises(ValueError):
1579            result = format(2.34, format_string)
1580
1581    def test_format_huge_item_number(self):
1582        format_string = u"{{{}:.6f}}".format(sys.maxsize + 1)
1583        with self.assertRaises(ValueError):
1584            result = format_string.format(2.34)
1585
1586    def test_format_auto_numbering(self):
1587        class C:
1588            def __init__(self, x=100):
1589                self._x = x
1590            def __format__(self, spec):
1591                return spec
1592
1593        self.assertEqual(u'{}'.format(10), u'10')
1594        self.assertEqual(u'{:5}'.format('s'), u's    ')
1595        self.assertEqual(u'{!r}'.format('s'), u"'s'")
1596        self.assertEqual(u'{._x}'.format(C(10)), u'10')
1597        self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
1598        self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
1599        self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')
1600
1601        self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a    x     b')
1602        self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')
1603
1604        # can't mix and match numbering and auto-numbering
1605        self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
1606        self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
1607        self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
1608        self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)
1609
1610        # can mix and match auto-numbering and named
1611        self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
1612        self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
1613        self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
1614        self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1615
1616    def test_raiseMemError(self):
1617        # Ensure that the freelist contains a consistent object, even
1618        # when a string allocation fails with a MemoryError.
1619        # This used to crash the interpreter,
1620        # or leak references when the number was smaller.
1621        charwidth = 4 if sys.maxunicode >= 0x10000 else 2
1622        # Note: sys.maxsize is half of the actual max allocation because of
1623        # the signedness of Py_ssize_t.
1624        alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
1625        self.assertRaises(MemoryError, alloc)
1626        self.assertRaises(MemoryError, alloc)
1627
1628    def test_format_subclass(self):
1629        class U(unicode):
1630            def __unicode__(self):
1631                return u'__unicode__ overridden'
1632        u = U(u'xxx')
1633        self.assertEqual("%s" % u, u'__unicode__ overridden')
1634        self.assertEqual("{}".format(u), '__unicode__ overridden')
1635
1636    def test_encode_decimal(self):
1637        from _testcapi import unicode_encodedecimal
1638        self.assertEqual(unicode_encodedecimal(u'123'),
1639                         b'123')
1640        self.assertEqual(unicode_encodedecimal(u'\u0663.\u0661\u0664'),
1641                         b'3.14')
1642        self.assertEqual(unicode_encodedecimal(u"\N{EM SPACE}3.14\N{EN SPACE}"),
1643                         b' 3.14 ')
1644        self.assertRaises(UnicodeEncodeError,
1645                          unicode_encodedecimal, u"123\u20ac", "strict")
1646        self.assertEqual(unicode_encodedecimal(u"123\u20ac", "replace"),
1647                         b'123?')
1648        self.assertEqual(unicode_encodedecimal(u"123\u20ac", "ignore"),
1649                         b'123')
1650        self.assertEqual(unicode_encodedecimal(u"123\u20ac", "xmlcharrefreplace"),
1651                         b'123&#8364;')
1652        self.assertEqual(unicode_encodedecimal(u"123\u20ac", "backslashreplace"),
1653                         b'123\\u20ac')
1654        self.assertEqual(unicode_encodedecimal(u"123\u20ac\N{EM SPACE}", "replace"),
1655                         b'123? ')
1656        self.assertEqual(unicode_encodedecimal(u"123\u20ac\u20ac", "replace"),
1657                         b'123??')
1658        self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
1659                         b'123?0')
1660
1661
def test_main():
    """Pass this module's name to test_support.run_unittest."""
    this_module = __name__
    test_support.run_unittest(this_module)


# Run the suite when the file is executed as a script.
if __name__ == "__main__":
    test_main()
1667