unicodeobject.c revision 0d4df752acbaf14164f1e8b2b95ebe3fe288bb82
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/*[clinic input]
51class str "PyUnicodeObject *" "&PyUnicode_Type"
52[clinic start generated code]*/
53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
54
55/* --- Globals ------------------------------------------------------------
56
57NOTE: In the interpreter's initialization phase, some globals are currently
58      initialized dynamically as needed. In the process Unicode objects may
59      be created before the Unicode type is ready.
60
61*/
62
63
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Highest valid Unicode code point: U+10FFFF (1,114,111 decimal).
   Stated here as the maximum code point of Unicode 6.0. */
#define MAX_UNICODE 0x10ffff

/* Sanity check applied to the object handed to the accessor macros below.
   In Py_DEBUG builds it runs _PyUnicode_CheckConsistency() with
   check_content=0; in other builds it is only the PyUnicode_Check() type
   test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
76
/* Field accessors.  The macros whose expansion contains no assert() are raw
   lvalue accessors; the others check the object with _PyUnicode_CHECK()
   first. */

/* Raw access to the utf8 pointer of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   memory immediately following the PyASCIIObject header; for every other
   string it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   length field of the PyASCIIObject header; otherwise the utf8_length
   field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer (PyASCIIObject header). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw access to wstr_length.  The field is reached through a
   PyCompactUnicodeObject cast, i.e. it is not part of PyASCIIObject. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length field. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw access to the state bit-field struct (kind, compact, ascii, ...). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw access to the hash field. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind after a _PyUnicode_CHECK(); readiness is not asserted. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length after a _PyUnicode_CHECK(); readiness is not asserted. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
111
/* Drop any earlier definition of PyUnicode_READY and define a file-local
   one: after a _PyUnicode_CHECK(), evaluate to 0 when the string is already
   ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
118
/* True if the utf8 pointer aliases the canonical character data, i.e. no
   separate UTF-8 buffer exists.  Must not be used on compact ASCII strings
   (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the canonical character data.
   The macro operates on its own argument `op`; it previously referred to a
   variable named `unicode` at the expansion site, which only worked when the
   caller happened to use exactly that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is non-NULL, and it does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready yet or wstr does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast to to_type.  The main
   loop handles four elements per iteration up to the element count passed
   through _Py_SIZE_ROUND_DOWN(n, 4); the trailing loop converts whatever is
   left.  The destination must have room for (end - begin) elements; no
   terminator is written.  The macro declares block-local variables named
   _to, _iter, _end, n and _unrolled_end. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  It starts out
   as NULL and is created on first use by _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;
177
/* Take a reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  After a successful creation the object
   is INCREF'ed once more, so (assuming PyUnicode_New returns a new
   reference) one reference stays with the global and one goes to the
   caller.  If creation fails, unicode_empty is left NULL and no reference is
   taken; callers must check for that. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return the shared empty string from the enclosing function with a
   reference taken for the caller.  The returned value is NULL when the
   empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
196
/* Forward declaration of the inline single-character writer helper. */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (head pointer; NULL while the list is empty). */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means no shared object has been stored for that character yet
   (unicode_result_ready() fills entries on demand). */
static PyObject *unicode_latin1[256] = {NULL};
207
/* Lookup table for quick detection of the most common whitespace
   characters.  It is indexed by a 7-bit code (0..127); an entry is 1 when
   the character is whitespace and 0 otherwise.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
238
/* Forward declarations of file-local helpers that are used before their
   definitions. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors from a buffer of `size` characters of a fixed width. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers; both take the offending range as
   [startpos, endpos) positions into `unicode`. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
264
/* Lookup table for quick detection of line break characters, in the same
   format as _Py_ascii_whitespace: indexed by a 7-bit code (0..127), entry 1
   for a line break and 0 otherwise.  The table is only ever read, so it is
   declared const like its whitespace counterpart. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
293#include "clinic/unicodeobject.c.h"
294
295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296   This function is kept for backward compatibility with the old API. */
297Py_UNICODE
298PyUnicode_GetMax(void)
299{
300#ifdef Py_UNICODE_WIDE
301    return 0x10FFFF;
302#else
303    /* This is actually an illegal character, so it should
304       not be passed to unichr. */
305    return 0xFFFF;
306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not in the wchar_t-only
                  representation), also scan the characters to verify that
                  the narrowest possible kind is in use and that the buffer
                  is null-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1, so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: the character data directly follows the
               PyCompactUnicodeObject header and is never used as the UTF-8
               buffer */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists, everything
                   else must still be in its initial state */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready: data lives in a separate buffer */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t, and must not alias it otherwise */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent cached buffer must have a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* find the largest code point actually stored */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element one past the end must be the null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429    Py_ssize_t len;
430
431    len = _PyUnicode_WSTR_LENGTH(unicode);
432    if (len == 0) {
433        Py_DECREF(unicode);
434        _Py_RETURN_UNICODE_EMPTY();
435    }
436
437    if (len == 1) {
438        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
439        if ((Py_UCS4)ch < 256) {
440            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441            Py_DECREF(unicode);
442            return latin1_char;
443        }
444    }
445
446    if (_PyUnicode_Ready(unicode) < 0) {
447        Py_DECREF(unicode);
448        return NULL;
449    }
450#else
451    assert(Py_REFCNT(unicode) == 1);
452
453    /* don't make the result ready in debug mode to ensure that the caller
454       makes the string ready before using it */
455    assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457    return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463    Py_ssize_t length;
464
465    length = PyUnicode_GET_LENGTH(unicode);
466    if (length == 0) {
467        if (unicode != unicode_empty) {
468            Py_DECREF(unicode);
469            _Py_RETURN_UNICODE_EMPTY();
470        }
471        return unicode_empty;
472    }
473
474    if (length == 1) {
475        void *data = PyUnicode_DATA(unicode);
476        int kind = PyUnicode_KIND(unicode);
477        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
478        if (ch < 256) {
479            PyObject *latin1_char = unicode_latin1[ch];
480            if (latin1_char != NULL) {
481                if (unicode != latin1_char) {
482                    Py_INCREF(latin1_char);
483                    Py_DECREF(unicode);
484                }
485                return latin1_char;
486            }
487            else {
488                assert(_PyUnicode_CheckConsistency(unicode, 1));
489                Py_INCREF(unicode);
490                unicode_latin1[ch] = unicode;
491                return unicode;
492            }
493        }
494    }
495
496    assert(_PyUnicode_CheckConsistency(unicode, 1));
497    return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503    assert(_PyUnicode_CHECK(unicode));
504    if (PyUnicode_IS_READY(unicode))
505        return unicode_result_ready(unicode);
506    else
507        return unicode_result_wchar(unicode);
508}
509
510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513    if (PyUnicode_CheckExact(unicode)) {
514        if (PyUnicode_READY(unicode) == -1)
515            return NULL;
516        Py_INCREF(unicode);
517        return unicode;
518    }
519    else
520        /* Subtype -- return genuine unicode string with the same value. */
521        return _PyUnicode_Copy(unicode);
522}
523
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits in use: the largest of 128, 64 or
   32 that fits into a long (LONG_BIT). */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* type holding a bloom bitmask */
#define BLOOM_MASK unsigned long

/* starts with all bits set, so every character passes the filter until the
   real mask has been computed */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* non-zero if the bit selected by ch is set in mask, i.e. ch *may* be a
   member of the set the mask was built from */
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* line break test: characters below 128 use the ascii_linebreak table;
   other characters are pre-filtered with the bloom mask before the call to
   Py_UNICODE_ISLINEBREAK() */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
551
552Py_LOCAL_INLINE(BLOOM_MASK)
553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
554{
555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
556    do {                                               \
557        TYPE *data = (TYPE *)PTR;                      \
558        TYPE *end = data + LEN;                        \
559        Py_UCS4 ch;                                    \
560        for (; data != end; data++) {                  \
561            ch = *data;                                \
562            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563        }                                              \
564        break;                                         \
565    } while (0)
566
567    /* calculate simple bloom-style bitmask for a given unicode string */
568
569    BLOOM_MASK mask;
570
571    mask = 0;
572    switch (kind) {
573    case PyUnicode_1BYTE_KIND:
574        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575        break;
576    case PyUnicode_2BYTE_KIND:
577        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578        break;
579    case PyUnicode_4BYTE_KIND:
580        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581        break;
582    default:
583        assert(0);
584    }
585    return mask;
586
587#undef BLOOM_UPDATE
588}
589
590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
608#include "stringlib/replace.h"
609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
619#include "stringlib/replace.h"
620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
630#include "stringlib/replace.h"
631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
639#include "stringlib/undef.h"
640
641/* --- Unicode Object ----------------------------------------------------- */
642
/* Forward declaration.  `fixfct` is a callback that receives a string
   object and returns a Py_UCS4.
   NOTE(review): the meaning of the callback's return value is not visible
   from this declaration -- see the definition of fixup(). */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
645
646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
647                                     Py_ssize_t size, Py_UCS4 ch,
648                                     int direction)
649{
650    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652    switch (kind) {
653    case PyUnicode_1BYTE_KIND:
654        {
655            Py_UCS1 ch1 = (Py_UCS1) ch;
656            if (ch1 == ch)
657                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658            else
659                return -1;
660        }
661    case PyUnicode_2BYTE_KIND:
662        {
663            Py_UCS2 ch2 = (Py_UCS2) ch;
664            if (ch2 == ch)
665                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666            else
667                return -1;
668        }
669    case PyUnicode_4BYTE_KIND:
670        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671    default:
672        assert(0);
673        return -1;
674    }
675}
676
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in the range
   [old_length, current length) with 0xFF bytes so that use of
   uninitialized characters is detected early.  Nothing is written when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the number of bytes per character */
        memset(bytes + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
695
696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699    Py_ssize_t char_size;
700    Py_ssize_t struct_size;
701    Py_ssize_t new_size;
702    int share_wstr;
703    PyObject *new_unicode;
704#ifdef Py_DEBUG
705    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
708    assert(unicode_modifiable(unicode));
709    assert(PyUnicode_IS_READY(unicode));
710    assert(PyUnicode_IS_COMPACT(unicode));
711
712    char_size = PyUnicode_KIND(unicode);
713    if (PyUnicode_IS_ASCII(unicode))
714        struct_size = sizeof(PyASCIIObject);
715    else
716        struct_size = sizeof(PyCompactUnicodeObject);
717    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
718
719    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720        PyErr_NoMemory();
721        return NULL;
722    }
723    new_size = (struct_size + (length + 1) * char_size);
724
725    _Py_DEC_REFTOTAL;
726    _Py_ForgetReference(unicode);
727
728    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
729    if (new_unicode == NULL) {
730        _Py_NewReference(unicode);
731        PyErr_NoMemory();
732        return NULL;
733    }
734    unicode = new_unicode;
735    _Py_NewReference(unicode);
736
737    _PyUnicode_LENGTH(unicode) = length;
738    if (share_wstr) {
739        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
740        if (!PyUnicode_IS_ASCII(unicode))
741            _PyUnicode_WSTR_LENGTH(unicode) = length;
742    }
743    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
744        PyObject_DEL(_PyUnicode_WSTR(unicode));
745        _PyUnicode_WSTR(unicode) = NULL;
746    }
747#ifdef Py_DEBUG
748    unicode_fill_invalid(unicode, old_length);
749#endif
750    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
751                    length, 0);
752    assert(_PyUnicode_CheckConsistency(unicode, 0));
753    return unicode;
754}
755
/* Resize a non-compact string in place to `length` characters.

   unicode: a non-compact string with a reference count of 1 (both are
            asserted).
   length:  new length in characters, excluding the null terminator.

   For a ready string the canonical data buffer is reallocated; a UTF-8 or
   wstr pointer that aliased the old buffer is re-pointed at the new one,
   and a separately allocated UTF-8 buffer is discarded.  If the string is
   not ready, or if it owns a separate wstr buffer, that wstr buffer is
   reallocated as well.

   Returns 0 on success, -1 with MemoryError set on overflow or allocation
   failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        /* remember which cached pointers alias the data buffer before it
           is reallocated */
        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* guard (length + 1) * char_size against overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* drop a privately owned UTF-8 buffer */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* re-point the aliases at the (possibly moved) buffer */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the null terminator */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless a separate wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* the result goes into a local first, so the object keeps its old
       buffer if the reallocation fails */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
834
835static PyObject*
836resize_copy(PyObject *unicode, Py_ssize_t length)
837{
838    Py_ssize_t copy_length;
839    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
840        PyObject *copy;
841
842        if (PyUnicode_READY(unicode) == -1)
843            return NULL;
844
845        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
846        if (copy == NULL)
847            return NULL;
848
849        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
850        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
851        return copy;
852    }
853    else {
854        PyObject *w;
855
856        w = (PyObject*)_PyUnicode_New(length);
857        if (w == NULL)
858            return NULL;
859        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
860        copy_length = Py_MIN(copy_length, length);
861        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
862                  copy_length * sizeof(wchar_t));
863        return w;
864    }
865}
866
/* We allocate room for one more character (not just one byte) to make
   sure the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never reduces its size below 1.

*/
875
876static PyUnicodeObject *
877_PyUnicode_New(Py_ssize_t length)
878{
879    PyUnicodeObject *unicode;
880    size_t new_size;
881
882    /* Optimization for empty strings */
883    if (length == 0 && unicode_empty != NULL) {
884        Py_INCREF(unicode_empty);
885        return (PyUnicodeObject*)unicode_empty;
886    }
887
888    /* Ensure we won't overflow the size. */
889    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
890        return (PyUnicodeObject *)PyErr_NoMemory();
891    }
892    if (length < 0) {
893        PyErr_SetString(PyExc_SystemError,
894                        "Negative size passed to _PyUnicode_New");
895        return NULL;
896    }
897
898    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
899    if (unicode == NULL)
900        return NULL;
901    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
902
903    _PyUnicode_WSTR_LENGTH(unicode) = length;
904    _PyUnicode_HASH(unicode) = -1;
905    _PyUnicode_STATE(unicode).interned = 0;
906    _PyUnicode_STATE(unicode).kind = 0;
907    _PyUnicode_STATE(unicode).compact = 0;
908    _PyUnicode_STATE(unicode).ready = 0;
909    _PyUnicode_STATE(unicode).ascii = 0;
910    _PyUnicode_DATA_ANY(unicode) = NULL;
911    _PyUnicode_LENGTH(unicode) = 0;
912    _PyUnicode_UTF8(unicode) = NULL;
913    _PyUnicode_UTF8_LENGTH(unicode) = 0;
914
915    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
916    if (!_PyUnicode_WSTR(unicode)) {
917        Py_DECREF(unicode);
918        PyErr_NoMemory();
919        return NULL;
920    }
921
922    /* Initialize the first element to guard against cases where
923     * the caller fails before initializing str -- unicode_resize()
924     * reads str[0], and the Keep-Alive optimization can keep memory
925     * allocated for str alive across a call to unicode_dealloc(unicode).
926     * We don't want unicode_resize to read uninitialized memory in
927     * that case.
928     */
929    _PyUnicode_WSTR(unicode)[0] = 0;
930    _PyUnicode_WSTR(unicode)[length] = 0;
931
932    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
933    return unicode;
934}
935
936static const char*
937unicode_kind_name(PyObject *unicode)
938{
939    /* don't check consistency: unicode_kind_name() is called from
940       _PyUnicode_Dump() */
941    if (!PyUnicode_IS_COMPACT(unicode))
942    {
943        if (!PyUnicode_IS_READY(unicode))
944            return "wstr";
945        switch (PyUnicode_KIND(unicode))
946        {
947        case PyUnicode_1BYTE_KIND:
948            if (PyUnicode_IS_ASCII(unicode))
949                return "legacy ascii";
950            else
951                return "legacy latin1";
952        case PyUnicode_2BYTE_KIND:
953            return "legacy UCS2";
954        case PyUnicode_4BYTE_KIND:
955            return "legacy UCS4";
956        default:
957            return "<legacy invalid kind>";
958        }
959    }
960    assert(PyUnicode_IS_READY(unicode));
961    switch (PyUnicode_KIND(unicode)) {
962    case PyUnicode_1BYTE_KIND:
963        if (PyUnicode_IS_ASCII(unicode))
964            return "ascii";
965        else
966            return "latin1";
967    case PyUnicode_2BYTE_KIND:
968        return "UCS2";
969    case PyUnicode_4BYTE_KIND:
970        return "UCS4";
971    default:
972        return "<invalid compact kind>";
973    }
974}
975
976#ifdef Py_DEBUG
977/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
981
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
985void *_PyUnicode_data(void *unicode){
986    printf("obj %p\n", unicode);
987    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
988    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
989    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
990    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
991    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
992    return PyUnicode_DATA(unicode);
993}
994
995void
996_PyUnicode_Dump(PyObject *op)
997{
998    PyASCIIObject *ascii = (PyASCIIObject *)op;
999    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1000    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1001    void *data;
1002
1003    if (ascii->state.compact)
1004    {
1005        if (ascii->state.ascii)
1006            data = (ascii + 1);
1007        else
1008            data = (compact + 1);
1009    }
1010    else
1011        data = unicode->data.any;
1012    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1013           unicode_kind_name(op), ascii->length);
1014
1015    if (ascii->wstr == data)
1016        printf("shared ");
1017    printf("wstr=%p", ascii->wstr);
1018
1019    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1020        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1021        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022            printf("shared ");
1023        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1024               compact->utf8, compact->utf8_length);
1025    }
1026    printf(", data=%p\n", data);
1027}
1028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033    PyObject *obj;
1034    PyCompactUnicodeObject *unicode;
1035    void *data;
1036    enum PyUnicode_Kind kind;
1037    int is_sharing, is_ascii;
1038    Py_ssize_t char_size;
1039    Py_ssize_t struct_size;
1040
1041    /* Optimization for empty strings */
1042    if (size == 0 && unicode_empty != NULL) {
1043        Py_INCREF(unicode_empty);
1044        return unicode_empty;
1045    }
1046
1047    is_ascii = 0;
1048    is_sharing = 0;
1049    struct_size = sizeof(PyCompactUnicodeObject);
1050    if (maxchar < 128) {
1051        kind = PyUnicode_1BYTE_KIND;
1052        char_size = 1;
1053        is_ascii = 1;
1054        struct_size = sizeof(PyASCIIObject);
1055    }
1056    else if (maxchar < 256) {
1057        kind = PyUnicode_1BYTE_KIND;
1058        char_size = 1;
1059    }
1060    else if (maxchar < 65536) {
1061        kind = PyUnicode_2BYTE_KIND;
1062        char_size = 2;
1063        if (sizeof(wchar_t) == 2)
1064            is_sharing = 1;
1065    }
1066    else {
1067        if (maxchar > MAX_UNICODE) {
1068            PyErr_SetString(PyExc_SystemError,
1069                            "invalid maximum character passed to PyUnicode_New");
1070            return NULL;
1071        }
1072        kind = PyUnicode_4BYTE_KIND;
1073        char_size = 4;
1074        if (sizeof(wchar_t) == 4)
1075            is_sharing = 1;
1076    }
1077
1078    /* Ensure we won't overflow the size. */
1079    if (size < 0) {
1080        PyErr_SetString(PyExc_SystemError,
1081                        "Negative size passed to PyUnicode_New");
1082        return NULL;
1083    }
1084    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085        return PyErr_NoMemory();
1086
1087    /* Duplicated allocation code from _PyObject_New() instead of a call to
1088     * PyObject_New() so we are able to allocate space for the object and
1089     * it's data buffer.
1090     */
1091    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092    if (obj == NULL)
1093        return PyErr_NoMemory();
1094    obj = PyObject_INIT(obj, &PyUnicode_Type);
1095    if (obj == NULL)
1096        return NULL;
1097
1098    unicode = (PyCompactUnicodeObject *)obj;
1099    if (is_ascii)
1100        data = ((PyASCIIObject*)obj) + 1;
1101    else
1102        data = unicode + 1;
1103    _PyUnicode_LENGTH(unicode) = size;
1104    _PyUnicode_HASH(unicode) = -1;
1105    _PyUnicode_STATE(unicode).interned = 0;
1106    _PyUnicode_STATE(unicode).kind = kind;
1107    _PyUnicode_STATE(unicode).compact = 1;
1108    _PyUnicode_STATE(unicode).ready = 1;
1109    _PyUnicode_STATE(unicode).ascii = is_ascii;
1110    if (is_ascii) {
1111        ((char*)data)[size] = 0;
1112        _PyUnicode_WSTR(unicode) = NULL;
1113    }
1114    else if (kind == PyUnicode_1BYTE_KIND) {
1115        ((char*)data)[size] = 0;
1116        _PyUnicode_WSTR(unicode) = NULL;
1117        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1118        unicode->utf8 = NULL;
1119        unicode->utf8_length = 0;
1120    }
1121    else {
1122        unicode->utf8 = NULL;
1123        unicode->utf8_length = 0;
1124        if (kind == PyUnicode_2BYTE_KIND)
1125            ((Py_UCS2*)data)[size] = 0;
1126        else /* kind == PyUnicode_4BYTE_KIND */
1127            ((Py_UCS4*)data)[size] = 0;
1128        if (is_sharing) {
1129            _PyUnicode_WSTR_LENGTH(unicode) = size;
1130            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131        }
1132        else {
1133            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134            _PyUnicode_WSTR(unicode) = NULL;
1135        }
1136    }
1137#ifdef Py_DEBUG
1138    unicode_fill_invalid((PyObject*)unicode, 0);
1139#endif
1140    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1141    return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146   will decode surrogate pairs, the other conversions are implemented as macros
1147   for efficiency.
1148
1149   This function assumes that unicode can hold one more code point than wstr
1150   characters for a terminating null character. */
1151static void
1152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1153                              PyObject *unicode)
1154{
1155    const wchar_t *iter;
1156    Py_UCS4 *ucs4_out;
1157
1158    assert(unicode != NULL);
1159    assert(_PyUnicode_CHECK(unicode));
1160    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163    for (iter = begin; iter < end; ) {
1164        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165                           _PyUnicode_GET_LENGTH(unicode)));
1166        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167            && (iter+1) < end
1168            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1169        {
1170            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1171            iter += 2;
1172        }
1173        else {
1174            *ucs4_out++ = *iter;
1175            iter++;
1176        }
1177    }
1178    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179                        _PyUnicode_GET_LENGTH(unicode)));
1180
1181}
1182#endif
1183
1184static int
1185unicode_check_modifiable(PyObject *unicode)
1186{
1187    if (!unicode_modifiable(unicode)) {
1188        PyErr_SetString(PyExc_SystemError,
1189                        "Cannot modify a string currently used");
1190        return -1;
1191    }
1192    return 0;
1193}
1194
1195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197                 PyObject *from, Py_ssize_t from_start,
1198                 Py_ssize_t how_many, int check_maxchar)
1199{
1200    unsigned int from_kind, to_kind;
1201    void *from_data, *to_data;
1202
1203    assert(0 <= how_many);
1204    assert(0 <= from_start);
1205    assert(0 <= to_start);
1206    assert(PyUnicode_Check(from));
1207    assert(PyUnicode_IS_READY(from));
1208    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1209
1210    assert(PyUnicode_Check(to));
1211    assert(PyUnicode_IS_READY(to));
1212    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
1214    if (how_many == 0)
1215        return 0;
1216
1217    from_kind = PyUnicode_KIND(from);
1218    from_data = PyUnicode_DATA(from);
1219    to_kind = PyUnicode_KIND(to);
1220    to_data = PyUnicode_DATA(to);
1221
1222#ifdef Py_DEBUG
1223    if (!check_maxchar
1224        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225    {
1226        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227        Py_UCS4 ch;
1228        Py_ssize_t i;
1229        for (i=0; i < how_many; i++) {
1230            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231            assert(ch <= to_maxchar);
1232        }
1233    }
1234#endif
1235
1236    if (from_kind == to_kind) {
1237        if (check_maxchar
1238            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239        {
1240            /* Writing Latin-1 characters into an ASCII string requires to
1241               check that all written characters are pure ASCII */
1242            Py_UCS4 max_char;
1243            max_char = ucs1lib_find_max_char(from_data,
1244                                             (Py_UCS1*)from_data + how_many);
1245            if (max_char >= 128)
1246                return -1;
1247        }
1248        Py_MEMCPY((char*)to_data + to_kind * to_start,
1249                  (char*)from_data + from_kind * from_start,
1250                  to_kind * how_many);
1251    }
1252    else if (from_kind == PyUnicode_1BYTE_KIND
1253             && to_kind == PyUnicode_2BYTE_KIND)
1254    {
1255        _PyUnicode_CONVERT_BYTES(
1256            Py_UCS1, Py_UCS2,
1257            PyUnicode_1BYTE_DATA(from) + from_start,
1258            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259            PyUnicode_2BYTE_DATA(to) + to_start
1260            );
1261    }
1262    else if (from_kind == PyUnicode_1BYTE_KIND
1263             && to_kind == PyUnicode_4BYTE_KIND)
1264    {
1265        _PyUnicode_CONVERT_BYTES(
1266            Py_UCS1, Py_UCS4,
1267            PyUnicode_1BYTE_DATA(from) + from_start,
1268            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269            PyUnicode_4BYTE_DATA(to) + to_start
1270            );
1271    }
1272    else if (from_kind == PyUnicode_2BYTE_KIND
1273             && to_kind == PyUnicode_4BYTE_KIND)
1274    {
1275        _PyUnicode_CONVERT_BYTES(
1276            Py_UCS2, Py_UCS4,
1277            PyUnicode_2BYTE_DATA(from) + from_start,
1278            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279            PyUnicode_4BYTE_DATA(to) + to_start
1280            );
1281    }
1282    else {
1283        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
1285        if (!check_maxchar) {
1286            if (from_kind == PyUnicode_2BYTE_KIND
1287                && to_kind == PyUnicode_1BYTE_KIND)
1288            {
1289                _PyUnicode_CONVERT_BYTES(
1290                    Py_UCS2, Py_UCS1,
1291                    PyUnicode_2BYTE_DATA(from) + from_start,
1292                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293                    PyUnicode_1BYTE_DATA(to) + to_start
1294                    );
1295            }
1296            else if (from_kind == PyUnicode_4BYTE_KIND
1297                     && to_kind == PyUnicode_1BYTE_KIND)
1298            {
1299                _PyUnicode_CONVERT_BYTES(
1300                    Py_UCS4, Py_UCS1,
1301                    PyUnicode_4BYTE_DATA(from) + from_start,
1302                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303                    PyUnicode_1BYTE_DATA(to) + to_start
1304                    );
1305            }
1306            else if (from_kind == PyUnicode_4BYTE_KIND
1307                     && to_kind == PyUnicode_2BYTE_KIND)
1308            {
1309                _PyUnicode_CONVERT_BYTES(
1310                    Py_UCS4, Py_UCS2,
1311                    PyUnicode_4BYTE_DATA(from) + from_start,
1312                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313                    PyUnicode_2BYTE_DATA(to) + to_start
1314                    );
1315            }
1316            else {
1317                assert(0);
1318                return -1;
1319            }
1320        }
1321        else {
1322            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1323            Py_UCS4 ch;
1324            Py_ssize_t i;
1325
1326            for (i=0; i < how_many; i++) {
1327                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1328                if (ch > to_maxchar)
1329                    return -1;
1330                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331            }
1332        }
1333    }
1334    return 0;
1335}
1336
1337void
1338_PyUnicode_FastCopyCharacters(
1339    PyObject *to, Py_ssize_t to_start,
1340    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1341{
1342    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347                         PyObject *from, Py_ssize_t from_start,
1348                         Py_ssize_t how_many)
1349{
1350    int err;
1351
1352    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353        PyErr_BadInternalCall();
1354        return -1;
1355    }
1356
1357    if (PyUnicode_READY(from) == -1)
1358        return -1;
1359    if (PyUnicode_READY(to) == -1)
1360        return -1;
1361
1362    if (from_start < 0) {
1363        PyErr_SetString(PyExc_IndexError, "string index out of range");
1364        return -1;
1365    }
1366    if (to_start < 0) {
1367        PyErr_SetString(PyExc_IndexError, "string index out of range");
1368        return -1;
1369    }
1370    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372        PyErr_Format(PyExc_SystemError,
1373                     "Cannot write %zi characters at %zi "
1374                     "in a string of %zi characters",
1375                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1376        return -1;
1377    }
1378
1379    if (how_many == 0)
1380        return 0;
1381
1382    if (unicode_check_modifiable(to))
1383        return -1;
1384
1385    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386    if (err) {
1387        PyErr_Format(PyExc_SystemError,
1388                     "Cannot copy %s characters "
1389                     "into a string of %s characters",
1390                     unicode_kind_name(from),
1391                     unicode_kind_name(to));
1392        return -1;
1393    }
1394    return how_many;
1395}
1396
1397/* Find the maximum code point and count the number of surrogate pairs so a
1398   correct string length can be computed before converting a string to UCS4.
1399   This function counts single surrogates as a character and not as a pair.
1400
1401   Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1405{
1406    const wchar_t *iter;
1407    Py_UCS4 ch;
1408
1409    assert(num_surrogates != NULL && maxchar != NULL);
1410    *num_surrogates = 0;
1411    *maxchar = 0;
1412
1413    for (iter = begin; iter < end; ) {
1414#if SIZEOF_WCHAR_T == 2
1415        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416            && (iter+1) < end
1417            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418        {
1419            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420            ++(*num_surrogates);
1421            iter += 2;
1422        }
1423        else
1424#endif
1425        {
1426            ch = *iter;
1427            iter++;
1428        }
1429        if (ch > *maxchar) {
1430            *maxchar = ch;
1431            if (*maxchar > MAX_UNICODE) {
1432                PyErr_Format(PyExc_ValueError,
1433                             "character U+%x is not in range [U+0000; U+10ffff]",
1434                             ch);
1435                return -1;
1436            }
1437        }
1438    }
1439    return 0;
1440}
1441
1442int
1443_PyUnicode_Ready(PyObject *unicode)
1444{
1445    wchar_t *end;
1446    Py_UCS4 maxchar = 0;
1447    Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449    Py_ssize_t length_wo_surrogates;
1450#endif
1451
1452    /* _PyUnicode_Ready() is only intended for old-style API usage where
1453       strings were created using _PyObject_New() and where no canonical
1454       representation (the str field) has been set yet aka strings
1455       which are not yet ready. */
1456    assert(_PyUnicode_CHECK(unicode));
1457    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1458    assert(_PyUnicode_WSTR(unicode) != NULL);
1459    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1460    assert(_PyUnicode_UTF8(unicode) == NULL);
1461    /* Actually, it should neither be interned nor be anything else: */
1462    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1463
1464    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1465    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1466                                &maxchar, &num_surrogates) == -1)
1467        return -1;
1468
1469    if (maxchar < 256) {
1470        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471        if (!_PyUnicode_DATA_ANY(unicode)) {
1472            PyErr_NoMemory();
1473            return -1;
1474        }
1475        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1476                                _PyUnicode_WSTR(unicode), end,
1477                                PyUnicode_1BYTE_DATA(unicode));
1478        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481        if (maxchar < 128) {
1482            _PyUnicode_STATE(unicode).ascii = 1;
1483            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1484            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1485        }
1486        else {
1487            _PyUnicode_STATE(unicode).ascii = 0;
1488            _PyUnicode_UTF8(unicode) = NULL;
1489            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1490        }
1491        PyObject_FREE(_PyUnicode_WSTR(unicode));
1492        _PyUnicode_WSTR(unicode) = NULL;
1493        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494    }
1495    /* In this case we might have to convert down from 4-byte native
1496       wchar_t to 2-byte unicode. */
1497    else if (maxchar < 65536) {
1498        assert(num_surrogates == 0 &&
1499               "FindMaxCharAndNumSurrogatePairs() messed up");
1500
1501#if SIZEOF_WCHAR_T == 2
1502        /* We can share representations and are done. */
1503        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1504        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1507        _PyUnicode_UTF8(unicode) = NULL;
1508        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1509#else
1510        /* sizeof(wchar_t) == 4 */
1511        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1512            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1513        if (!_PyUnicode_DATA_ANY(unicode)) {
1514            PyErr_NoMemory();
1515            return -1;
1516        }
1517        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518                                _PyUnicode_WSTR(unicode), end,
1519                                PyUnicode_2BYTE_DATA(unicode));
1520        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1523        _PyUnicode_UTF8(unicode) = NULL;
1524        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1525        PyObject_FREE(_PyUnicode_WSTR(unicode));
1526        _PyUnicode_WSTR(unicode) = NULL;
1527        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
1529    }
1530    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531    else {
1532#if SIZEOF_WCHAR_T == 2
1533        /* in case the native representation is 2-bytes, we need to allocate a
1534           new normalized 4-byte version. */
1535        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1536        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1537            PyErr_NoMemory();
1538            return -1;
1539        }
1540        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1541        if (!_PyUnicode_DATA_ANY(unicode)) {
1542            PyErr_NoMemory();
1543            return -1;
1544        }
1545        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1546        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1547        _PyUnicode_UTF8(unicode) = NULL;
1548        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1549        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1550        _PyUnicode_STATE(unicode).ready = 1;
1551        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1552        PyObject_FREE(_PyUnicode_WSTR(unicode));
1553        _PyUnicode_WSTR(unicode) = NULL;
1554        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1555#else
1556        assert(num_surrogates == 0);
1557
1558        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1559        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1560        _PyUnicode_UTF8(unicode) = NULL;
1561        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1562        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1563#endif
1564        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1565    }
1566    _PyUnicode_STATE(unicode).ready = 1;
1567    assert(_PyUnicode_CheckConsistency(unicode, 1));
1568    return 0;
1569}
1570
1571static void
1572unicode_dealloc(PyObject *unicode)
1573{
1574    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1575    case SSTATE_NOT_INTERNED:
1576        break;
1577
1578    case SSTATE_INTERNED_MORTAL:
1579        /* revive dead object temporarily for DelItem */
1580        Py_REFCNT(unicode) = 3;
1581        if (PyDict_DelItem(interned, unicode) != 0)
1582            Py_FatalError(
1583                "deletion of interned string failed");
1584        break;
1585
1586    case SSTATE_INTERNED_IMMORTAL:
1587        Py_FatalError("Immortal interned string died.");
1588
1589    default:
1590        Py_FatalError("Inconsistent interned string state.");
1591    }
1592
1593    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1594        PyObject_DEL(_PyUnicode_WSTR(unicode));
1595    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1596        PyObject_DEL(_PyUnicode_UTF8(unicode));
1597    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1598        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1599
1600    Py_TYPE(unicode)->tp_free(unicode);
1601}
1602
1603#ifdef Py_DEBUG
1604static int
1605unicode_is_singleton(PyObject *unicode)
1606{
1607    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1608    if (unicode == unicode_empty)
1609        return 1;
1610    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1611    {
1612        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1613        if (ch < 256 && unicode_latin1[ch] == unicode)
1614            return 1;
1615    }
1616    return 0;
1617}
1618#endif
1619
1620static int
1621unicode_modifiable(PyObject *unicode)
1622{
1623    assert(_PyUnicode_CHECK(unicode));
1624    if (Py_REFCNT(unicode) != 1)
1625        return 0;
1626    if (_PyUnicode_HASH(unicode) != -1)
1627        return 0;
1628    if (PyUnicode_CHECK_INTERNED(unicode))
1629        return 0;
1630    if (!PyUnicode_CheckExact(unicode))
1631        return 0;
1632#ifdef Py_DEBUG
1633    /* singleton refcount is greater than 1 */
1634    assert(!unicode_is_singleton(unicode));
1635#endif
1636    return 1;
1637}
1638
1639static int
1640unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1641{
1642    PyObject *unicode;
1643    Py_ssize_t old_length;
1644
1645    assert(p_unicode != NULL);
1646    unicode = *p_unicode;
1647
1648    assert(unicode != NULL);
1649    assert(PyUnicode_Check(unicode));
1650    assert(0 <= length);
1651
1652    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1653        old_length = PyUnicode_WSTR_LENGTH(unicode);
1654    else
1655        old_length = PyUnicode_GET_LENGTH(unicode);
1656    if (old_length == length)
1657        return 0;
1658
1659    if (length == 0) {
1660        _Py_INCREF_UNICODE_EMPTY();
1661        if (!unicode_empty)
1662            return -1;
1663        Py_DECREF(*p_unicode);
1664        *p_unicode = unicode_empty;
1665        return 0;
1666    }
1667
1668    if (!unicode_modifiable(unicode)) {
1669        PyObject *copy = resize_copy(unicode, length);
1670        if (copy == NULL)
1671            return -1;
1672        Py_DECREF(*p_unicode);
1673        *p_unicode = copy;
1674        return 0;
1675    }
1676
1677    if (PyUnicode_IS_COMPACT(unicode)) {
1678        PyObject *new_unicode = resize_compact(unicode, length);
1679        if (new_unicode == NULL)
1680            return -1;
1681        *p_unicode = new_unicode;
1682        return 0;
1683    }
1684    return resize_inplace(unicode, length);
1685}
1686
1687int
1688PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1689{
1690    PyObject *unicode;
1691    if (p_unicode == NULL) {
1692        PyErr_BadInternalCall();
1693        return -1;
1694    }
1695    unicode = *p_unicode;
1696    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1697    {
1698        PyErr_BadInternalCall();
1699        return -1;
1700    }
1701    return unicode_resize(p_unicode, length);
1702}
1703
1704/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1705
1706   WARNING: The function doesn't copy the terminating null character and
1707   doesn't check the maximum character (may write a latin1 character in an
1708   ASCII string). */
1709static void
1710unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1711                   const char *str, Py_ssize_t len)
1712{
1713    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1714    void *data = PyUnicode_DATA(unicode);
1715    const char *end = str + len;
1716
1717    switch (kind) {
1718    case PyUnicode_1BYTE_KIND: {
1719        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1720#ifdef Py_DEBUG
1721        if (PyUnicode_IS_ASCII(unicode)) {
1722            Py_UCS4 maxchar = ucs1lib_find_max_char(
1723                (const Py_UCS1*)str,
1724                (const Py_UCS1*)str + len);
1725            assert(maxchar < 128);
1726        }
1727#endif
1728        memcpy((char *) data + index, str, len);
1729        break;
1730    }
1731    case PyUnicode_2BYTE_KIND: {
1732        Py_UCS2 *start = (Py_UCS2 *)data + index;
1733        Py_UCS2 *ucs2 = start;
1734        assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
1736        for (; str < end; ++ucs2, ++str)
1737            *ucs2 = (Py_UCS2)*str;
1738
1739        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1740        break;
1741    }
1742    default: {
1743        Py_UCS4 *start = (Py_UCS4 *)data + index;
1744        Py_UCS4 *ucs4 = start;
1745        assert(kind == PyUnicode_4BYTE_KIND);
1746        assert(index <= PyUnicode_GET_LENGTH(unicode));
1747
1748        for (; str < end; ++ucs4, ++str)
1749            *ucs4 = (Py_UCS4)*str;
1750
1751        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1752    }
1753    }
1754}
1755
1756static PyObject*
1757get_latin1_char(unsigned char ch)
1758{
1759    PyObject *unicode = unicode_latin1[ch];
1760    if (!unicode) {
1761        unicode = PyUnicode_New(1, ch);
1762        if (!unicode)
1763            return NULL;
1764        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1765        assert(_PyUnicode_CheckConsistency(unicode, 1));
1766        unicode_latin1[ch] = unicode;
1767    }
1768    Py_INCREF(unicode);
1769    return unicode;
1770}
1771
1772static PyObject*
1773unicode_char(Py_UCS4 ch)
1774{
1775    PyObject *unicode;
1776
1777    assert(ch <= MAX_UNICODE);
1778
1779    if (ch < 256)
1780        return get_latin1_char(ch);
1781
1782    unicode = PyUnicode_New(1, ch);
1783    if (unicode == NULL)
1784        return NULL;
1785    switch (PyUnicode_KIND(unicode)) {
1786    case PyUnicode_1BYTE_KIND:
1787        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1788        break;
1789    case PyUnicode_2BYTE_KIND:
1790        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1791        break;
1792    default:
1793        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1794        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1795    }
1796    assert(_PyUnicode_CheckConsistency(unicode, 1));
1797    return unicode;
1798}
1799
1800PyObject *
1801PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1802{
1803    PyObject *unicode;
1804    Py_UCS4 maxchar = 0;
1805    Py_ssize_t num_surrogates;
1806
1807    if (u == NULL)
1808        return (PyObject*)_PyUnicode_New(size);
1809
1810    /* If the Unicode data is known at construction time, we can apply
1811       some optimizations which share commonly used objects. */
1812
1813    /* Optimization for empty strings */
1814    if (size == 0)
1815        _Py_RETURN_UNICODE_EMPTY();
1816
1817    /* Single character Unicode objects in the Latin-1 range are
1818       shared when using this constructor */
1819    if (size == 1 && (Py_UCS4)*u < 256)
1820        return get_latin1_char((unsigned char)*u);
1821
1822    /* If not empty and not single character, copy the Unicode data
1823       into the new object */
1824    if (find_maxchar_surrogates(u, u + size,
1825                                &maxchar, &num_surrogates) == -1)
1826        return NULL;
1827
1828    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1829    if (!unicode)
1830        return NULL;
1831
1832    switch (PyUnicode_KIND(unicode)) {
1833    case PyUnicode_1BYTE_KIND:
1834        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1835                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1836        break;
1837    case PyUnicode_2BYTE_KIND:
1838#if Py_UNICODE_SIZE == 2
1839        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1840#else
1841        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1842                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1843#endif
1844        break;
1845    case PyUnicode_4BYTE_KIND:
1846#if SIZEOF_WCHAR_T == 2
1847        /* This is the only case which has to process surrogates, thus
1848           a simple copy loop is not enough and we need a function. */
1849        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1850#else
1851        assert(num_surrogates == 0);
1852        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1853#endif
1854        break;
1855    default:
1856        assert(0 && "Impossible state");
1857    }
1858
1859    return unicode_result(unicode);
1860}
1861
1862PyObject *
1863PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1864{
1865    if (size < 0) {
1866        PyErr_SetString(PyExc_SystemError,
1867                        "Negative size passed to PyUnicode_FromStringAndSize");
1868        return NULL;
1869    }
1870    if (u != NULL)
1871        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1872    else
1873        return (PyObject *)_PyUnicode_New(size);
1874}
1875
1876PyObject *
1877PyUnicode_FromString(const char *u)
1878{
1879    size_t size = strlen(u);
1880    if (size > PY_SSIZE_T_MAX) {
1881        PyErr_SetString(PyExc_OverflowError, "input too long");
1882        return NULL;
1883    }
1884    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1885}
1886
1887PyObject *
1888_PyUnicode_FromId(_Py_Identifier *id)
1889{
1890    if (!id->object) {
1891        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1892                                                  strlen(id->string),
1893                                                  NULL, NULL);
1894        if (!id->object)
1895            return NULL;
1896        PyUnicode_InternInPlace(&id->object);
1897        assert(!id->next);
1898        id->next = static_strings;
1899        static_strings = id;
1900    }
1901    return id->object;
1902}
1903
1904void
1905_PyUnicode_ClearStaticStrings()
1906{
1907    _Py_Identifier *tmp, *s = static_strings;
1908    while (s) {
1909        Py_CLEAR(s->object);
1910        tmp = s->next;
1911        s->next = NULL;
1912        s = tmp;
1913    }
1914    static_strings = NULL;
1915}
1916
1917/* Internal function, doesn't check maximum character */
1918
1919PyObject*
1920_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1921{
1922    const unsigned char *s = (const unsigned char *)buffer;
1923    PyObject *unicode;
1924    if (size == 1) {
1925#ifdef Py_DEBUG
1926        assert((unsigned char)s[0] < 128);
1927#endif
1928        return get_latin1_char(s[0]);
1929    }
1930    unicode = PyUnicode_New(size, 127);
1931    if (!unicode)
1932        return NULL;
1933    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1934    assert(_PyUnicode_CheckConsistency(unicode, 1));
1935    return unicode;
1936}
1937
1938static Py_UCS4
1939kind_maxchar_limit(unsigned int kind)
1940{
1941    switch (kind) {
1942    case PyUnicode_1BYTE_KIND:
1943        return 0x80;
1944    case PyUnicode_2BYTE_KIND:
1945        return 0x100;
1946    case PyUnicode_4BYTE_KIND:
1947        return 0x10000;
1948    default:
1949        assert(0 && "invalid kind");
1950        return MAX_UNICODE;
1951    }
1952}
1953
1954Py_LOCAL_INLINE(Py_UCS4)
1955align_maxchar(Py_UCS4 maxchar)
1956{
1957    if (maxchar <= 127)
1958        return 127;
1959    else if (maxchar <= 255)
1960        return 255;
1961    else if (maxchar <= 65535)
1962        return 65535;
1963    else
1964        return MAX_UNICODE;
1965}
1966
1967static PyObject*
1968_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1969{
1970    PyObject *res;
1971    unsigned char max_char;
1972
1973    if (size == 0)
1974        _Py_RETURN_UNICODE_EMPTY();
1975    assert(size > 0);
1976    if (size == 1)
1977        return get_latin1_char(u[0]);
1978
1979    max_char = ucs1lib_find_max_char(u, u + size);
1980    res = PyUnicode_New(size, max_char);
1981    if (!res)
1982        return NULL;
1983    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1984    assert(_PyUnicode_CheckConsistency(res, 1));
1985    return res;
1986}
1987
1988static PyObject*
1989_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1990{
1991    PyObject *res;
1992    Py_UCS2 max_char;
1993
1994    if (size == 0)
1995        _Py_RETURN_UNICODE_EMPTY();
1996    assert(size > 0);
1997    if (size == 1)
1998        return unicode_char(u[0]);
1999
2000    max_char = ucs2lib_find_max_char(u, u + size);
2001    res = PyUnicode_New(size, max_char);
2002    if (!res)
2003        return NULL;
2004    if (max_char >= 256)
2005        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2006    else {
2007        _PyUnicode_CONVERT_BYTES(
2008            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2009    }
2010    assert(_PyUnicode_CheckConsistency(res, 1));
2011    return res;
2012}
2013
2014static PyObject*
2015_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2016{
2017    PyObject *res;
2018    Py_UCS4 max_char;
2019
2020    if (size == 0)
2021        _Py_RETURN_UNICODE_EMPTY();
2022    assert(size > 0);
2023    if (size == 1)
2024        return unicode_char(u[0]);
2025
2026    max_char = ucs4lib_find_max_char(u, u + size);
2027    res = PyUnicode_New(size, max_char);
2028    if (!res)
2029        return NULL;
2030    if (max_char < 256)
2031        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2032                                 PyUnicode_1BYTE_DATA(res));
2033    else if (max_char < 0x10000)
2034        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2035                                 PyUnicode_2BYTE_DATA(res));
2036    else
2037        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2038    assert(_PyUnicode_CheckConsistency(res, 1));
2039    return res;
2040}
2041
2042PyObject*
2043PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2044{
2045    if (size < 0) {
2046        PyErr_SetString(PyExc_ValueError, "size must be positive");
2047        return NULL;
2048    }
2049    switch (kind) {
2050    case PyUnicode_1BYTE_KIND:
2051        return _PyUnicode_FromUCS1(buffer, size);
2052    case PyUnicode_2BYTE_KIND:
2053        return _PyUnicode_FromUCS2(buffer, size);
2054    case PyUnicode_4BYTE_KIND:
2055        return _PyUnicode_FromUCS4(buffer, size);
2056    default:
2057        PyErr_SetString(PyExc_SystemError, "invalid kind");
2058        return NULL;
2059    }
2060}
2061
2062Py_UCS4
2063_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2064{
2065    enum PyUnicode_Kind kind;
2066    void *startptr, *endptr;
2067
2068    assert(PyUnicode_IS_READY(unicode));
2069    assert(0 <= start);
2070    assert(end <= PyUnicode_GET_LENGTH(unicode));
2071    assert(start <= end);
2072
2073    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2074        return PyUnicode_MAX_CHAR_VALUE(unicode);
2075
2076    if (start == end)
2077        return 127;
2078
2079    if (PyUnicode_IS_ASCII(unicode))
2080        return 127;
2081
2082    kind = PyUnicode_KIND(unicode);
2083    startptr = PyUnicode_DATA(unicode);
2084    endptr = (char *)startptr + end * kind;
2085    startptr = (char *)startptr + start * kind;
2086    switch(kind) {
2087    case PyUnicode_1BYTE_KIND:
2088        return ucs1lib_find_max_char(startptr, endptr);
2089    case PyUnicode_2BYTE_KIND:
2090        return ucs2lib_find_max_char(startptr, endptr);
2091    case PyUnicode_4BYTE_KIND:
2092        return ucs4lib_find_max_char(startptr, endptr);
2093    default:
2094        assert(0);
2095        return 0;
2096    }
2097}
2098
2099/* Ensure that a string uses the most efficient storage, if it is not the
2100   case: create a new string with of the right kind. Write NULL into *p_unicode
2101   on error. */
2102static void
2103unicode_adjust_maxchar(PyObject **p_unicode)
2104{
2105    PyObject *unicode, *copy;
2106    Py_UCS4 max_char;
2107    Py_ssize_t len;
2108    unsigned int kind;
2109
2110    assert(p_unicode != NULL);
2111    unicode = *p_unicode;
2112    assert(PyUnicode_IS_READY(unicode));
2113    if (PyUnicode_IS_ASCII(unicode))
2114        return;
2115
2116    len = PyUnicode_GET_LENGTH(unicode);
2117    kind = PyUnicode_KIND(unicode);
2118    if (kind == PyUnicode_1BYTE_KIND) {
2119        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2120        max_char = ucs1lib_find_max_char(u, u + len);
2121        if (max_char >= 128)
2122            return;
2123    }
2124    else if (kind == PyUnicode_2BYTE_KIND) {
2125        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2126        max_char = ucs2lib_find_max_char(u, u + len);
2127        if (max_char >= 256)
2128            return;
2129    }
2130    else {
2131        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2132        assert(kind == PyUnicode_4BYTE_KIND);
2133        max_char = ucs4lib_find_max_char(u, u + len);
2134        if (max_char >= 0x10000)
2135            return;
2136    }
2137    copy = PyUnicode_New(len, max_char);
2138    if (copy != NULL)
2139        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2140    Py_DECREF(unicode);
2141    *p_unicode = copy;
2142}
2143
2144PyObject*
2145_PyUnicode_Copy(PyObject *unicode)
2146{
2147    Py_ssize_t length;
2148    PyObject *copy;
2149
2150    if (!PyUnicode_Check(unicode)) {
2151        PyErr_BadInternalCall();
2152        return NULL;
2153    }
2154    if (PyUnicode_READY(unicode) == -1)
2155        return NULL;
2156
2157    length = PyUnicode_GET_LENGTH(unicode);
2158    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2159    if (!copy)
2160        return NULL;
2161    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2162
2163    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2164              length * PyUnicode_KIND(unicode));
2165    assert(_PyUnicode_CheckConsistency(copy, 1));
2166    return copy;
2167}
2168
2169
2170/* Widen Unicode objects to larger buffers. Don't write terminating null
2171   character. Return NULL on error. */
2172
2173void*
2174_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2175{
2176    Py_ssize_t len;
2177    void *result;
2178    unsigned int skind;
2179
2180    if (PyUnicode_READY(s) == -1)
2181        return NULL;
2182
2183    len = PyUnicode_GET_LENGTH(s);
2184    skind = PyUnicode_KIND(s);
2185    if (skind >= kind) {
2186        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2187        return NULL;
2188    }
2189    switch (kind) {
2190    case PyUnicode_2BYTE_KIND:
2191        result = PyMem_New(Py_UCS2, len);
2192        if (!result)
2193            return PyErr_NoMemory();
2194        assert(skind == PyUnicode_1BYTE_KIND);
2195        _PyUnicode_CONVERT_BYTES(
2196            Py_UCS1, Py_UCS2,
2197            PyUnicode_1BYTE_DATA(s),
2198            PyUnicode_1BYTE_DATA(s) + len,
2199            result);
2200        return result;
2201    case PyUnicode_4BYTE_KIND:
2202        result = PyMem_New(Py_UCS4, len);
2203        if (!result)
2204            return PyErr_NoMemory();
2205        if (skind == PyUnicode_2BYTE_KIND) {
2206            _PyUnicode_CONVERT_BYTES(
2207                Py_UCS2, Py_UCS4,
2208                PyUnicode_2BYTE_DATA(s),
2209                PyUnicode_2BYTE_DATA(s) + len,
2210                result);
2211        }
2212        else {
2213            assert(skind == PyUnicode_1BYTE_KIND);
2214            _PyUnicode_CONVERT_BYTES(
2215                Py_UCS1, Py_UCS4,
2216                PyUnicode_1BYTE_DATA(s),
2217                PyUnicode_1BYTE_DATA(s) + len,
2218                result);
2219        }
2220        return result;
2221    default:
2222        break;
2223    }
2224    PyErr_SetString(PyExc_SystemError, "invalid kind");
2225    return NULL;
2226}
2227
2228static Py_UCS4*
2229as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230        int copy_null)
2231{
2232    int kind;
2233    void *data;
2234    Py_ssize_t len, targetlen;
2235    if (PyUnicode_READY(string) == -1)
2236        return NULL;
2237    kind = PyUnicode_KIND(string);
2238    data = PyUnicode_DATA(string);
2239    len = PyUnicode_GET_LENGTH(string);
2240    targetlen = len;
2241    if (copy_null)
2242        targetlen++;
2243    if (!target) {
2244        target = PyMem_New(Py_UCS4, targetlen);
2245        if (!target) {
2246            PyErr_NoMemory();
2247            return NULL;
2248        }
2249    }
2250    else {
2251        if (targetsize < targetlen) {
2252            PyErr_Format(PyExc_SystemError,
2253                         "string is longer than the buffer");
2254            if (copy_null && 0 < targetsize)
2255                target[0] = 0;
2256            return NULL;
2257        }
2258    }
2259    if (kind == PyUnicode_1BYTE_KIND) {
2260        Py_UCS1 *start = (Py_UCS1 *) data;
2261        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2262    }
2263    else if (kind == PyUnicode_2BYTE_KIND) {
2264        Py_UCS2 *start = (Py_UCS2 *) data;
2265        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266    }
2267    else {
2268        assert(kind == PyUnicode_4BYTE_KIND);
2269        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2270    }
2271    if (copy_null)
2272        target[len] = 0;
2273    return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278                 int copy_null)
2279{
2280    if (target == NULL || targetsize < 0) {
2281        PyErr_BadInternalCall();
2282        return NULL;
2283    }
2284    return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290    return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
2294
2295PyObject *
2296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2297{
2298    if (w == NULL) {
2299        if (size == 0)
2300            _Py_RETURN_UNICODE_EMPTY();
2301        PyErr_BadInternalCall();
2302        return NULL;
2303    }
2304
2305    if (size == -1) {
2306        size = wcslen(w);
2307    }
2308
2309    return PyUnicode_FromUnicode(w, size);
2310}
2311
2312#endif /* HAVE_WCHAR_H */
2313
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   NOTE(review): the leading "2 +" presumably accounts for the sign and
   for the terminating null character written by sprintf() -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2318
2319static int
2320unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2321                             Py_ssize_t width, Py_ssize_t precision)
2322{
2323    Py_ssize_t length, fill, arglen;
2324    Py_UCS4 maxchar;
2325
2326    if (PyUnicode_READY(str) == -1)
2327        return -1;
2328
2329    length = PyUnicode_GET_LENGTH(str);
2330    if ((precision == -1 || precision >= length)
2331        && width <= length)
2332        return _PyUnicodeWriter_WriteStr(writer, str);
2333
2334    if (precision != -1)
2335        length = Py_MIN(precision, length);
2336
2337    arglen = Py_MAX(length, width);
2338    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2339        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2340    else
2341        maxchar = writer->maxchar;
2342
2343    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2344        return -1;
2345
2346    if (width > length) {
2347        fill = width - length;
2348        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2349            return -1;
2350        writer->pos += fill;
2351    }
2352
2353    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2354                                  str, 0, length);
2355    writer->pos += length;
2356    return 0;
2357}
2358
2359static int
2360unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2361                              Py_ssize_t width, Py_ssize_t precision)
2362{
2363    /* UTF-8 */
2364    Py_ssize_t length;
2365    PyObject *unicode;
2366    int res;
2367
2368    length = strlen(str);
2369    if (precision != -1)
2370        length = Py_MIN(length, precision);
2371    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2372    if (unicode == NULL)
2373        return -1;
2374
2375    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2376    Py_DECREF(unicode);
2377    return res;
2378}
2379
/* Format a single conversion specification for PyUnicode_FromFormatV().

   f points to the '%' that starts the specification; the matching
   argument(s) are fetched from *vargs and the formatted text is
   appended to writer.

   The specification is parsed as: optional '0' flag, optional decimal
   width, optional '.' followed by a decimal precision, optional size
   flag ('l', 'll' when HAVE_LONG_LONG is defined, or 'z', each only in
   front of 'd', 'u' or 'i'), then the conversion character.  Handled
   conversions are: c, i, d, u, x, p, s, U, V, S, R, A and %.

   For an unknown conversion the rest of the format string, starting at
   the '%', is copied verbatim to the output.

   Return a pointer to the first character after the consumed part of
   the format string, or NULL on error. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the specification starts (needed by the fallback
       for unknown conversions) and skip the '%'. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Refuse the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* This conversion is the last thing in the format string: turn off
       the writer's overallocation. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* Single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the number with the C library; the argument type read
           from *vargs depends on the size flag parsed above. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on, precision is the number of characters written
           for the number itself, including leading zeros. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the width with '0' or ' ' depending on the flag. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with zeros. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the digits (and the terminating null) right by two
               to make room for the "0x" prefix. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object and a C
           string that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
2683
2684PyObject *
2685PyUnicode_FromFormatV(const char *format, va_list vargs)
2686{
2687    va_list vargs2;
2688    const char *f;
2689    _PyUnicodeWriter writer;
2690
2691    _PyUnicodeWriter_Init(&writer);
2692    writer.min_length = strlen(format) + 100;
2693    writer.overallocate = 1;
2694
2695    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2696       Copy it to be able to pass a reference to a subfunction. */
2697    Py_VA_COPY(vargs2, vargs);
2698
2699    for (f = format; *f; ) {
2700        if (*f == '%') {
2701            f = unicode_fromformat_arg(&writer, f, &vargs2);
2702            if (f == NULL)
2703                goto fail;
2704        }
2705        else {
2706            const char *p;
2707            Py_ssize_t len;
2708
2709            p = f;
2710            do
2711            {
2712                if ((unsigned char)*p > 127) {
2713                    PyErr_Format(PyExc_ValueError,
2714                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2715                        "string, got a non-ASCII byte: 0x%02x",
2716                        (unsigned char)*p);
2717                    return NULL;
2718                }
2719                p++;
2720            }
2721            while (*p != '\0' && *p != '%');
2722            len = p - f;
2723
2724            if (*p == '\0')
2725                writer.overallocate = 0;
2726
2727            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2728                goto fail;
2729
2730            f = p;
2731        }
2732    }
2733    return _PyUnicodeWriter_Finish(&writer);
2734
2735  fail:
2736    _PyUnicodeWriter_Dealloc(&writer);
2737    return NULL;
2738}
2739
2740PyObject *
2741PyUnicode_FromFormat(const char *format, ...)
2742{
2743    PyObject* ret;
2744    va_list vargs;
2745
2746#ifdef HAVE_STDARG_PROTOTYPES
2747    va_start(vargs, format);
2748#else
2749    va_start(vargs);
2750#endif
2751    ret = PyUnicode_FromFormatV(format, vargs);
2752    va_end(vargs);
2753    return ret;
2754}
2755
2756#ifdef HAVE_WCHAR_H
2757
2758/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2759   convert a Unicode object to a wide character string.
2760
2761   - If w is NULL: return the number of wide characters (including the null
2762     character) required to convert the unicode object. Ignore size argument.
2763
2764   - Otherwise: return the number of wide characters (excluding the null
2765     character) written into w. Write at most size wide characters (including
2766     the null character). */
2767static Py_ssize_t
2768unicode_aswidechar(PyObject *unicode,
2769                   wchar_t *w,
2770                   Py_ssize_t size)
2771{
2772    Py_ssize_t res;
2773    const wchar_t *wstr;
2774
2775    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2776    if (wstr == NULL)
2777        return -1;
2778
2779    if (w != NULL) {
2780        if (size > res)
2781            size = res + 1;
2782        else
2783            res = size;
2784        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2785        return res;
2786    }
2787    else
2788        return res + 1;
2789}
2790
2791Py_ssize_t
2792PyUnicode_AsWideChar(PyObject *unicode,
2793                     wchar_t *w,
2794                     Py_ssize_t size)
2795{
2796    if (unicode == NULL) {
2797        PyErr_BadInternalCall();
2798        return -1;
2799    }
2800    return unicode_aswidechar(unicode, w, size);
2801}
2802
2803wchar_t*
2804PyUnicode_AsWideCharString(PyObject *unicode,
2805                           Py_ssize_t *size)
2806{
2807    wchar_t* buffer;
2808    Py_ssize_t buflen;
2809
2810    if (unicode == NULL) {
2811        PyErr_BadInternalCall();
2812        return NULL;
2813    }
2814
2815    buflen = unicode_aswidechar(unicode, NULL, 0);
2816    if (buflen == -1)
2817        return NULL;
2818    buffer = PyMem_NEW(wchar_t, buflen);
2819    if (buffer == NULL) {
2820        PyErr_NoMemory();
2821        return NULL;
2822    }
2823    buflen = unicode_aswidechar(unicode, buffer, buflen);
2824    if (buflen == -1) {
2825        PyMem_FREE(buffer);
2826        return NULL;
2827    }
2828    if (size != NULL)
2829        *size = buflen;
2830    return buffer;
2831}
2832
2833#endif /* HAVE_WCHAR_H */
2834
2835PyObject *
2836PyUnicode_FromOrdinal(int ordinal)
2837{
2838    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2839        PyErr_SetString(PyExc_ValueError,
2840                        "chr() arg not in range(0x110000)");
2841        return NULL;
2842    }
2843
2844    return unicode_char((Py_UCS4)ordinal);
2845}
2846
2847PyObject *
2848PyUnicode_FromObject(PyObject *obj)
2849{
2850    /* XXX Perhaps we should make this API an alias of
2851       PyObject_Str() instead ?! */
2852    if (PyUnicode_CheckExact(obj)) {
2853        if (PyUnicode_READY(obj) == -1)
2854            return NULL;
2855        Py_INCREF(obj);
2856        return obj;
2857    }
2858    if (PyUnicode_Check(obj)) {
2859        /* For a Unicode subtype that's not a Unicode object,
2860           return a true Unicode object with the same data. */
2861        return _PyUnicode_Copy(obj);
2862    }
2863    PyErr_Format(PyExc_TypeError,
2864                 "Can't convert '%.100s' object to str implicitly",
2865                 Py_TYPE(obj)->tp_name);
2866    return NULL;
2867}
2868
2869PyObject *
2870PyUnicode_FromEncodedObject(PyObject *obj,
2871                            const char *encoding,
2872                            const char *errors)
2873{
2874    Py_buffer buffer;
2875    PyObject *v;
2876
2877    if (obj == NULL) {
2878        PyErr_BadInternalCall();
2879        return NULL;
2880    }
2881
2882    /* Decoding bytes objects is the most common case and should be fast */
2883    if (PyBytes_Check(obj)) {
2884        if (PyBytes_GET_SIZE(obj) == 0)
2885            _Py_RETURN_UNICODE_EMPTY();
2886        v = PyUnicode_Decode(
2887                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2888                encoding, errors);
2889        return v;
2890    }
2891
2892    if (PyUnicode_Check(obj)) {
2893        PyErr_SetString(PyExc_TypeError,
2894                        "decoding str is not supported");
2895        return NULL;
2896    }
2897
2898    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2899    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2900        PyErr_Format(PyExc_TypeError,
2901                     "coercing to str: need a bytes-like object, %.80s found",
2902                     Py_TYPE(obj)->tp_name);
2903        return NULL;
2904    }
2905
2906    if (buffer.len == 0) {
2907        PyBuffer_Release(&buffer);
2908        _Py_RETURN_UNICODE_EMPTY();
2909    }
2910
2911    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2912    PyBuffer_Release(&buffer);
2913    return v;
2914}
2915
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8. The normalized, null-terminated name is written to
   lower, a buffer of lower_len bytes. A NULL encoding is normalized to
   "utf-8".

   Return 0 on error (the result, including its terminating null, does
   not fit into lower_len bytes), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminating null. Bail out
       before computing &lower[lower_len - 1]: the unsigned subtraction
       would wrap around and the bound below would never be reached. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot; it is reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2955
2956PyObject *
2957PyUnicode_Decode(const char *s,
2958                 Py_ssize_t size,
2959                 const char *encoding,
2960                 const char *errors)
2961{
2962    PyObject *buffer = NULL, *unicode;
2963    Py_buffer info;
2964    char lower[11];  /* Enough for any encoding shortcut */
2965
2966    /* Shortcuts for common default encodings */
2967    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
2968        if ((strcmp(lower, "utf-8") == 0) ||
2969            (strcmp(lower, "utf8") == 0))
2970            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2971        else if ((strcmp(lower, "latin-1") == 0) ||
2972                 (strcmp(lower, "latin1") == 0) ||
2973                 (strcmp(lower, "iso-8859-1") == 0) ||
2974                 (strcmp(lower, "iso8859-1") == 0))
2975            return PyUnicode_DecodeLatin1(s, size, errors);
2976#ifdef HAVE_MBCS
2977        else if (strcmp(lower, "mbcs") == 0)
2978            return PyUnicode_DecodeMBCS(s, size, errors);
2979#endif
2980        else if (strcmp(lower, "ascii") == 0)
2981            return PyUnicode_DecodeASCII(s, size, errors);
2982        else if (strcmp(lower, "utf-16") == 0)
2983            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2984        else if (strcmp(lower, "utf-32") == 0)
2985            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2986    }
2987
2988    /* Decode via the codec registry */
2989    buffer = NULL;
2990    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2991        goto onError;
2992    buffer = PyMemoryView_FromBuffer(&info);
2993    if (buffer == NULL)
2994        goto onError;
2995    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
2996    if (unicode == NULL)
2997        goto onError;
2998    if (!PyUnicode_Check(unicode)) {
2999        PyErr_Format(PyExc_TypeError,
3000                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3001                     "use codecs.decode() to decode to arbitrary types",
3002                     encoding,
3003                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3004        Py_DECREF(unicode);
3005        goto onError;
3006    }
3007    Py_DECREF(buffer);
3008    return unicode_result(unicode);
3009
3010  onError:
3011    Py_XDECREF(buffer);
3012    return NULL;
3013}
3014
3015PyObject *
3016PyUnicode_AsDecodedObject(PyObject *unicode,
3017                          const char *encoding,
3018                          const char *errors)
3019{
3020    PyObject *v;
3021
3022    if (!PyUnicode_Check(unicode)) {
3023        PyErr_BadArgument();
3024        goto onError;
3025    }
3026
3027    if (encoding == NULL)
3028        encoding = PyUnicode_GetDefaultEncoding();
3029
3030    /* Decode via the codec registry */
3031    v = PyCodec_Decode(unicode, encoding, errors);
3032    if (v == NULL)
3033        goto onError;
3034    return unicode_result(v);
3035
3036  onError:
3037    return NULL;
3038}
3039
3040PyObject *
3041PyUnicode_AsDecodedUnicode(PyObject *unicode,
3042                           const char *encoding,
3043                           const char *errors)
3044{
3045    PyObject *v;
3046
3047    if (!PyUnicode_Check(unicode)) {
3048        PyErr_BadArgument();
3049        goto onError;
3050    }
3051
3052    if (encoding == NULL)
3053        encoding = PyUnicode_GetDefaultEncoding();
3054
3055    /* Decode via the codec registry */
3056    v = PyCodec_Decode(unicode, encoding, errors);
3057    if (v == NULL)
3058        goto onError;
3059    if (!PyUnicode_Check(v)) {
3060        PyErr_Format(PyExc_TypeError,
3061                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3062                     "use codecs.decode() to decode to arbitrary types",
3063                     encoding,
3064                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3065        Py_DECREF(v);
3066        goto onError;
3067    }
3068    return unicode_result(v);
3069
3070  onError:
3071    return NULL;
3072}
3073
3074PyObject *
3075PyUnicode_Encode(const Py_UNICODE *s,
3076                 Py_ssize_t size,
3077                 const char *encoding,
3078                 const char *errors)
3079{
3080    PyObject *v, *unicode;
3081
3082    unicode = PyUnicode_FromUnicode(s, size);
3083    if (unicode == NULL)
3084        return NULL;
3085    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3086    Py_DECREF(unicode);
3087    return v;
3088}
3089
3090PyObject *
3091PyUnicode_AsEncodedObject(PyObject *unicode,
3092                          const char *encoding,
3093                          const char *errors)
3094{
3095    PyObject *v;
3096
3097    if (!PyUnicode_Check(unicode)) {
3098        PyErr_BadArgument();
3099        goto onError;
3100    }
3101
3102    if (encoding == NULL)
3103        encoding = PyUnicode_GetDefaultEncoding();
3104
3105    /* Encode via the codec registry */
3106    v = PyCodec_Encode(unicode, encoding, errors);
3107    if (v == NULL)
3108        goto onError;
3109    return v;
3110
3111  onError:
3112    return NULL;
3113}
3114
/* Find the index, in wchar_t units, of the first character of wstr that
   wcstombs() refuses to convert. The string is fed to wcstombs() one
   character at a time (one surrogate pair at a time when wchar_t is
   16 bits wide). Returns 0 when no offending character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Holds a single character (or surrogate pair) plus the null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3161
3162static int
3163locale_error_handler(const char *errors, int *surrogateescape)
3164{
3165    if (errors == NULL) {
3166        *surrogateescape = 0;
3167        return 0;
3168    }
3169
3170    if (strcmp(errors, "strict") == 0) {
3171        *surrogateescape = 0;
3172        return 0;
3173    }
3174    if (strcmp(errors, "surrogateescape") == 0) {
3175        *surrogateescape = 1;
3176        return 0;
3177    }
3178    PyErr_Format(PyExc_ValueError,
3179                 "only 'strict' and 'surrogateescape' error handlers "
3180                 "are supported, not '%s'",
3181                 errors);
3182    return -1;
3183}
3184
/* Encode a str object to bytes using the C library's locale conversion.

   errors must be NULL, "strict" or "surrogateescape" (checked by
   locale_error_handler()). A string containing an embedded null
   character raises ValueError. When a character cannot be encoded, a
   UnicodeEncodeError for the "locale" codec is built and raised through
   PyCodec_StrictErrors().

   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null: a mismatch with the reported
       length means the string has a null character in the middle. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* An error position of (size_t)-1 is treated as a memory
               failure; any other value is the failing character index. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First call only computes the encoded length. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown yet; it is searched for in encode_error. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call converts straight into the bytes object; len+1
           leaves room for the terminating null byte. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno right away, before the calls below. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    /* wstr is still owned here on every path that jumps to this label;
       bytes is only set when the second wcstombs() call failed. */
    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    if (errmsg != NULL) {
        size_t errlen;
        /* wstr is reused to hold the decoded error message. */
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    /* Fall back to a fixed message when the system one is unusable. */
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* The strict handler raises the exception object it is given. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3294
3295PyObject *
3296PyUnicode_EncodeFSDefault(PyObject *unicode)
3297{
3298#ifdef HAVE_MBCS
3299    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3300#elif defined(__APPLE__)
3301    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3302#else
3303    PyInterpreterState *interp = PyThreadState_GET()->interp;
3304    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3305       cannot use it to encode and decode filenames before it is loaded. Load
3306       the Python codec requires to encode at least its own filename. Use the C
3307       version of the locale codec until the codec registry is initialized and
3308       the Python codec is loaded.
3309
3310       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3311       cannot only rely on it: check also interp->fscodec_initialized for
3312       subinterpreters. */
3313    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3314        return PyUnicode_AsEncodedString(unicode,
3315                                         Py_FileSystemDefaultEncoding,
3316                                         "surrogateescape");
3317    }
3318    else {
3319        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3320    }
3321#endif
3322}
3323
3324PyObject *
3325PyUnicode_AsEncodedString(PyObject *unicode,
3326                          const char *encoding,
3327                          const char *errors)
3328{
3329    PyObject *v;
3330    char lower[11];  /* Enough for any encoding shortcut */
3331
3332    if (!PyUnicode_Check(unicode)) {
3333        PyErr_BadArgument();
3334        return NULL;
3335    }
3336
3337    /* Shortcuts for common default encodings */
3338    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3339        if ((strcmp(lower, "utf-8") == 0) ||
3340            (strcmp(lower, "utf8") == 0))
3341        {
3342            if (errors == NULL || strcmp(errors, "strict") == 0)
3343                return _PyUnicode_AsUTF8String(unicode, NULL);
3344            else
3345                return _PyUnicode_AsUTF8String(unicode, errors);
3346        }
3347        else if ((strcmp(lower, "latin-1") == 0) ||
3348                 (strcmp(lower, "latin1") == 0) ||
3349                 (strcmp(lower, "iso-8859-1") == 0) ||
3350                 (strcmp(lower, "iso8859-1") == 0))
3351            return _PyUnicode_AsLatin1String(unicode, errors);
3352#ifdef HAVE_MBCS
3353        else if (strcmp(lower, "mbcs") == 0)
3354            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3355#endif
3356        else if (strcmp(lower, "ascii") == 0)
3357            return _PyUnicode_AsASCIIString(unicode, errors);
3358    }
3359
3360    /* Encode via the codec registry */
3361    v = _PyCodec_EncodeText(unicode, encoding, errors);
3362    if (v == NULL)
3363        return NULL;
3364
3365    /* The normal path */
3366    if (PyBytes_Check(v))
3367        return v;
3368
3369    /* If the codec returns a buffer, raise a warning and convert to bytes */
3370    if (PyByteArray_Check(v)) {
3371        int error;
3372        PyObject *b;
3373
3374        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3375            "encoder %s returned bytearray instead of bytes; "
3376            "use codecs.encode() to encode to arbitrary types",
3377            encoding);
3378        if (error) {
3379            Py_DECREF(v);
3380            return NULL;
3381        }
3382
3383        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3384        Py_DECREF(v);
3385        return b;
3386    }
3387
3388    PyErr_Format(PyExc_TypeError,
3389                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3390                 "use codecs.encode() to encode to arbitrary types",
3391                 encoding,
3392                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3393    Py_DECREF(v);
3394    return NULL;
3395}
3396
3397PyObject *
3398PyUnicode_AsEncodedUnicode(PyObject *unicode,
3399                           const char *encoding,
3400                           const char *errors)
3401{
3402    PyObject *v;
3403
3404    if (!PyUnicode_Check(unicode)) {
3405        PyErr_BadArgument();
3406        goto onError;
3407    }
3408
3409    if (encoding == NULL)
3410        encoding = PyUnicode_GetDefaultEncoding();
3411
3412    /* Encode via the codec registry */
3413    v = PyCodec_Encode(unicode, encoding, errors);
3414    if (v == NULL)
3415        goto onError;
3416    if (!PyUnicode_Check(v)) {
3417        PyErr_Format(PyExc_TypeError,
3418                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3419                     "use codecs.encode() to encode to arbitrary types",
3420                     encoding,
3421                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3422        Py_DECREF(v);
3423        goto onError;
3424    }
3425    return v;
3426
3427  onError:
3428    return NULL;
3429}
3430
/* Find the byte offset of the first sequence in str (len bytes) that
   mbrtowc() reports as invalid or incomplete. Returns 0 when nothing is
   found, and always 0 when mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        /* Reached end of string */
        if (consumed == 0)
            break;
        /* Conversion error or incomplete character */
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3461
/* Decode len bytes at str into a str object using the C library's locale
   conversion.

   errors must be NULL, "strict" or "surrogateescape" (checked by
   locale_error_handler()). str must be null-terminated at str[len] and
   contain no earlier null byte, otherwise ValueError is raised. In strict
   mode an undecodable sequence produces a UnicodeDecodeError for the
   "locale" codec, raised through PyCodec_StrictErrors().

   Returns a new reference, or NULL with an exception set. */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
{
    /* Stack buffer used for short results to avoid a heap allocation. */
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;
    int surrogateescape;
    size_t error_pos;
    char *errmsg;
    PyObject *reason = NULL;   /* initialize to prevent gcc warning */
    PyObject *exc;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    /* The C conversion functions stop at the first null byte, so the
       input must be terminated exactly at str[len]. */
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        wstr = Py_DecodeLocale(str, &wlen);
        if (wstr == NULL) {
            /* A length of (size_t)-1 is treated as a memory failure. */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_RawFree(wstr);
    }
    else {
        /* strict mode */
#ifndef HAVE_BROKEN_MBSTOWCS
        /* First call only computes the number of wide characters. */
        wlen = mbstowcs(NULL, str, 0);
#else
        /* Use the byte length as the wide character count instead. */
        wlen = len;
#endif
        if (wlen == (size_t)-1)
            goto decode_error;
        if (wlen+1 <= smallbuf_len) {
            wstr = smallbuf;
        }
        else {
            wstr = PyMem_New(wchar_t, wlen+1);
            if (!wstr)
                return PyErr_NoMemory();
        }

        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            /* Only a heap buffer has to be released. */
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            goto decode_error;
        }
#ifdef HAVE_BROKEN_MBSTOWCS
        assert(wlen2 == wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;

decode_error:
    /* Read errno right away, before the calls below. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    error_pos = mbstowcs_errorpos(str, len);
    if (errmsg != NULL) {
        size_t errlen;
        /* wstr is reused to hold the decoded error message. */
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    /* Fall back to a fixed message when the system one is unusable. */
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "mbstowcs() encountered an invalid multibyte sequence");
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
                                "locale", str, len,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* The strict handler raises the exception object it is given. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3564
3565PyObject*
3566PyUnicode_DecodeLocale(const char *str, const char *errors)
3567{
3568    Py_ssize_t size = (Py_ssize_t)strlen(str);
3569    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3570}
3571
3572
3573PyObject*
3574PyUnicode_DecodeFSDefault(const char *s) {
3575    Py_ssize_t size = (Py_ssize_t)strlen(s);
3576    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3577}
3578
3579PyObject*
3580PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3581{
3582#ifdef HAVE_MBCS
3583    return PyUnicode_DecodeMBCS(s, size, NULL);
3584#elif defined(__APPLE__)
3585    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3586#else
3587    PyInterpreterState *interp = PyThreadState_GET()->interp;
3588    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3589       cannot use it to encode and decode filenames before it is loaded. Load
3590       the Python codec requires to encode at least its own filename. Use the C
3591       version of the locale codec until the codec registry is initialized and
3592       the Python codec is loaded.
3593
3594       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3595       cannot only rely on it: check also interp->fscodec_initialized for
3596       subinterpreters. */
3597    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3598        return PyUnicode_Decode(s, size,
3599                                Py_FileSystemDefaultEncoding,
3600                                "surrogateescape");
3601    }
3602    else {
3603        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3604    }
3605#endif
3606}
3607
3608
3609int
3610PyUnicode_FSConverter(PyObject* arg, void* addr)
3611{
3612    PyObject *output = NULL;
3613    Py_ssize_t size;
3614    void *data;
3615    if (arg == NULL) {
3616        Py_DECREF(*(PyObject**)addr);
3617        return 1;
3618    }
3619    if (PyBytes_Check(arg)) {
3620        output = arg;
3621        Py_INCREF(output);
3622    }
3623    else {
3624        arg = PyUnicode_FromObject(arg);
3625        if (!arg)
3626            return 0;
3627        output = PyUnicode_EncodeFSDefault(arg);
3628        Py_DECREF(arg);
3629        if (!output)
3630            return 0;
3631        if (!PyBytes_Check(output)) {
3632            Py_DECREF(output);
3633            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3634            return 0;
3635        }
3636    }
3637    size = PyBytes_GET_SIZE(output);
3638    data = PyBytes_AS_STRING(output);
3639    if ((size_t)size != strlen(data)) {
3640        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3641        Py_DECREF(output);
3642        return 0;
3643    }
3644    *(PyObject**)addr = output;
3645    return Py_CLEANUP_SUPPORTED;
3646}
3647
3648
3649int
3650PyUnicode_FSDecoder(PyObject* arg, void* addr)
3651{
3652    PyObject *output = NULL;
3653    if (arg == NULL) {
3654        Py_DECREF(*(PyObject**)addr);
3655        return 1;
3656    }
3657    if (PyUnicode_Check(arg)) {
3658        if (PyUnicode_READY(arg) == -1)
3659            return 0;
3660        output = arg;
3661        Py_INCREF(output);
3662    }
3663    else {
3664        arg = PyBytes_FromObject(arg);
3665        if (!arg)
3666            return 0;
3667        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3668                                                  PyBytes_GET_SIZE(arg));
3669        Py_DECREF(arg);
3670        if (!output)
3671            return 0;
3672        if (!PyUnicode_Check(output)) {
3673            Py_DECREF(output);
3674            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3675            return 0;
3676        }
3677    }
3678    if (PyUnicode_READY(output) == -1) {
3679        Py_DECREF(output);
3680        return 0;
3681    }
3682    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3683                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3684        PyErr_SetString(PyExc_ValueError, "embedded null character");
3685        Py_DECREF(output);
3686        return 0;
3687    }
3688    *(PyObject**)addr = output;
3689    return Py_CLEANUP_SUPPORTED;
3690}
3691
3692
3693char*
3694PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3695{
3696    PyObject *bytes;
3697
3698    if (!PyUnicode_Check(unicode)) {
3699        PyErr_BadArgument();
3700        return NULL;
3701    }
3702    if (PyUnicode_READY(unicode) == -1)
3703        return NULL;
3704
3705    if (PyUnicode_UTF8(unicode) == NULL) {
3706        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3707        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3708        if (bytes == NULL)
3709            return NULL;
3710        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3711        if (_PyUnicode_UTF8(unicode) == NULL) {
3712            PyErr_NoMemory();
3713            Py_DECREF(bytes);
3714            return NULL;
3715        }
3716        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3717        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3718                  PyBytes_AS_STRING(bytes),
3719                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3720        Py_DECREF(bytes);
3721    }
3722
3723    if (psize)
3724        *psize = PyUnicode_UTF8_LENGTH(unicode);
3725    return PyUnicode_UTF8(unicode);
3726}
3727
3728char*
3729PyUnicode_AsUTF8(PyObject *unicode)
3730{
3731    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3732}
3733
/* Return the wchar_t (Py_UNICODE) representation of unicode, creating it
   and storing it on the object when it does not exist yet. When size is
   not NULL it receives the length of that representation in wchar_t
   units. Returns NULL with an exception set on failure. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* 16-bit wchar_t: code points above 0xFFFF need two units,
               so count them first to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per character, one more per surrogate pair, plus
               the terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            /* Rewind: the counting loop above consumed the pointer. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to one wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to one wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3850
/* Convenience wrapper: calls PyUnicode_AsUnicodeAndSize() with a NULL size
   pointer (i.e. without asking for the length) and returns its result
   unchanged. */
Py_UNICODE *
PyUnicode_AsUnicode(PyObject *unicode)
{
    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
}
3856
3857
3858Py_ssize_t
3859PyUnicode_GetSize(PyObject *unicode)
3860{
3861    if (!PyUnicode_Check(unicode)) {
3862        PyErr_BadArgument();
3863        goto onError;
3864    }
3865    return PyUnicode_GET_SIZE(unicode);
3866
3867  onError:
3868    return -1;
3869}
3870
3871Py_ssize_t
3872PyUnicode_GetLength(PyObject *unicode)
3873{
3874    if (!PyUnicode_Check(unicode)) {
3875        PyErr_BadArgument();
3876        return -1;
3877    }
3878    if (PyUnicode_READY(unicode) == -1)
3879        return -1;
3880    return PyUnicode_GET_LENGTH(unicode);
3881}
3882
3883Py_UCS4
3884PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3885{
3886    void *data;
3887    int kind;
3888
3889    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3890        PyErr_BadArgument();
3891        return (Py_UCS4)-1;
3892    }
3893    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3894        PyErr_SetString(PyExc_IndexError, "string index out of range");
3895        return (Py_UCS4)-1;
3896    }
3897    data = PyUnicode_DATA(unicode);
3898    kind = PyUnicode_KIND(unicode);
3899    return PyUnicode_READ(kind, data, index);
3900}
3901
3902int
3903PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3904{
3905    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3906        PyErr_BadArgument();
3907        return -1;
3908    }
3909    assert(PyUnicode_IS_READY(unicode));
3910    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3911        PyErr_SetString(PyExc_IndexError, "string index out of range");
3912        return -1;
3913    }
3914    if (unicode_check_modifiable(unicode))
3915        return -1;
3916    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3917        PyErr_SetString(PyExc_ValueError, "character out of range");
3918        return -1;
3919    }
3920    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3921                    index, ch);
3922    return 0;
3923}
3924
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3930
3931/* create or adjust a UnicodeDecodeError */
3932static void
3933make_decode_exception(PyObject **exceptionObject,
3934                      const char *encoding,
3935                      const char *input, Py_ssize_t length,
3936                      Py_ssize_t startpos, Py_ssize_t endpos,
3937                      const char *reason)
3938{
3939    if (*exceptionObject == NULL) {
3940        *exceptionObject = PyUnicodeDecodeError_Create(
3941            encoding, input, length, startpos, endpos, reason);
3942    }
3943    else {
3944        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3945            goto onError;
3946        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3947            goto onError;
3948        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3949            goto onError;
3950    }
3951    return;
3952
3953onError:
3954    Py_CLEAR(*exceptionObject);
3955}
3956
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors/errorHandler: name of the error handler and a cache for the
       looked-up callback (*errorHandler is filled in on first use).
   encoding/reason:     passed on to the UnicodeDecodeError.
   input/inend:         in/out; re-pointed at the bytes object stored in
                        the exception, which the callback may have replaced.
   startinpos/endinpos: error range; *endinpos is set to the position
                        returned by the callback.
   exceptionObject:     in/out cache for the UnicodeDecodeError instance.
   inptr:               out; set to *input + the new position.
   output/outpos:       wchar_t based output string and write position; the
                        string is resized if the replacement plus the rest
                        of the input does not fit.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix and leaves only the
           human readable message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out instead of falling through: the PyBytes_* accessors
           below are only valid on a bytes object. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4072
4073static int
4074unicode_decode_call_errorhandler_writer(
4075    const char *errors, PyObject **errorHandler,
4076    const char *encoding, const char *reason,
4077    const char **input, const char **inend, Py_ssize_t *startinpos,
4078    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4079    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4080{
4081    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4082
4083    PyObject *restuple = NULL;
4084    PyObject *repunicode = NULL;
4085    Py_ssize_t insize;
4086    Py_ssize_t newpos;
4087    Py_ssize_t replen;
4088    PyObject *inputobj = NULL;
4089
4090    if (*errorHandler == NULL) {
4091        *errorHandler = PyCodec_LookupError(errors);
4092        if (*errorHandler == NULL)
4093            goto onError;
4094    }
4095
4096    make_decode_exception(exceptionObject,
4097        encoding,
4098        *input, *inend - *input,
4099        *startinpos, *endinpos,
4100        reason);
4101    if (*exceptionObject == NULL)
4102        goto onError;
4103
4104    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4105    if (restuple == NULL)
4106        goto onError;
4107    if (!PyTuple_Check(restuple)) {
4108        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4109        goto onError;
4110    }
4111    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4112        goto onError;
4113
4114    /* Copy back the bytes variables, which might have been modified by the
4115       callback */
4116    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4117    if (!inputobj)
4118        goto onError;
4119    if (!PyBytes_Check(inputobj)) {
4120        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4121    }
4122    *input = PyBytes_AS_STRING(inputobj);
4123    insize = PyBytes_GET_SIZE(inputobj);
4124    *inend = *input + insize;
4125    /* we can DECREF safely, as the exception has another reference,
4126       so the object won't go away. */
4127    Py_DECREF(inputobj);
4128
4129    if (newpos<0)
4130        newpos = insize+newpos;
4131    if (newpos<0 || newpos>insize) {
4132        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4133        goto onError;
4134    }
4135
4136    if (PyUnicode_READY(repunicode) < 0)
4137        goto onError;
4138    replen = PyUnicode_GET_LENGTH(repunicode);
4139    if (replen > 1) {
4140        writer->min_length += replen - 1;
4141        writer->overallocate = 1;
4142        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4143                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4144            goto onError;
4145    }
4146    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4147        goto onError;
4148
4149    *endinpos = newpos;
4150    *inptr = *input + newpos;
4151
4152    /* we made it! */
4153    Py_XDECREF(restuple);
4154    return 0;
4155
4156  onError:
4157    Py_XDECREF(restuple);
4158    return -1;
4159}
4160
4161/* --- UTF-7 Codec -------------------------------------------------------- */
4162
4163/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4164
/* Three simple macros defining base-64.
 * IS_BASE64 and FROM_BASE64 (and DECODE_DIRECT below) expand their
 * argument more than once, so the argument must not have side effects. */

/* Is c a base-64 character? (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62 and anything
 * else (i.e. '/') to 63.  The result is only meaningful if IS_BASE64(c). */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off with 0x3f before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4195
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The array is indexed by the ASCII code (0-127) of the character; each
 * row below covers 16 consecutive codes, labelled by the comment above it.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character; NUL and anything >= 128 is never direct, and
 *            the range test runs before (and guards) the table lookup.
 * directO  : non-zero if Set O characters (category 1) are written as-is.
 * directWS : non-zero if whitespace (category 2) is written as-is.
 * Set D characters (category 0) are always direct.  The macro expands c
 * several times, so c must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4241
/* Decode `size` bytes of UTF-7 data starting at `s`.
   Thin wrapper: forwards to PyUnicode_DecodeUTF7Stateful() with a NULL
   `consumed` pointer and returns its result unchanged. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
4249
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 encoded input bytes.
 * errors   : error handler name, handed to
 *            unicode_decode_call_errorhandler_writer() on malformed input.
 * consumed : if NULL, an inconsistent shift sequence at the end of the
 *            input is reported through the error handler.  If non-NULL,
 *            an unfinished shift sequence is not an error: *consumed is
 *            set to the offset of the '+' that opened it and the output
 *            produced by that sequence is dropped; otherwise *consumed is
 *            set to the number of bytes read.
 *
 * Returns the decoded str object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;        /* writer.pos when the current section began */
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;  /* pending base-64 bits, newest lowest */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler moved
           `s` back inside the input (see "goto restart" below). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the pending
                               high surrogate on its own, then handle outCh
                               below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* an unpaired high surrogate is written out as-is */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts: used as the error start
               position and as *consumed when the input ends mid-sequence */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte > 127 outside of a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The error range is [startinpos, s); the handler may replace the
           input buffer and moves `s` to the position where decoding
           resumes. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* NOTE(review): when the handler returns a position before the
               end of the input, decoding restarts with inShift, surrogate,
               base64bits and base64buffer still holding the state of the
               sequence that was just reported as an error -- confirm this
               is intended. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the first shiftOutStart characters
                   only, instead of truncating the writer.  Presumably this
                   is because the discarded characters may have raised
                   writer.maxchar -- TODO confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4446
4447
4448PyObject *
4449_PyUnicode_EncodeUTF7(PyObject *str,
4450                      int base64SetO,
4451                      int base64WhiteSpace,
4452                      const char *errors)
4453{
4454    int kind;
4455    void *data;
4456    Py_ssize_t len;
4457    PyObject *v;
4458    int inShift = 0;
4459    Py_ssize_t i;
4460    unsigned int base64bits = 0;
4461    unsigned long base64buffer = 0;
4462    char * out;
4463    char * start;
4464
4465    if (PyUnicode_READY(str) == -1)
4466        return NULL;
4467    kind = PyUnicode_KIND(str);
4468    data = PyUnicode_DATA(str);
4469    len = PyUnicode_GET_LENGTH(str);
4470
4471    if (len == 0)
4472        return PyBytes_FromStringAndSize(NULL, 0);
4473
4474    /* It might be possible to tighten this worst case */
4475    if (len > PY_SSIZE_T_MAX / 8)
4476        return PyErr_NoMemory();
4477    v = PyBytes_FromStringAndSize(NULL, len * 8);
4478    if (v == NULL)
4479        return NULL;
4480
4481    start = out = PyBytes_AS_STRING(v);
4482    for (i = 0; i < len; ++i) {
4483        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4484
4485        if (inShift) {
4486            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4487                /* shifting out */
4488                if (base64bits) { /* output remaining bits */
4489                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4490                    base64buffer = 0;
4491                    base64bits = 0;
4492                }
4493                inShift = 0;
4494                /* Characters not in the BASE64 set implicitly unshift the sequence
4495                   so no '-' is required, except if the character is itself a '-' */
4496                if (IS_BASE64(ch) || ch == '-') {
4497                    *out++ = '-';
4498                }
4499                *out++ = (char) ch;
4500            }
4501            else {
4502                goto encode_char;
4503            }
4504        }
4505        else { /* not in a shift sequence */
4506            if (ch == '+') {
4507                *out++ = '+';
4508                        *out++ = '-';
4509            }
4510            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4511                *out++ = (char) ch;
4512            }
4513            else {
4514                *out++ = '+';
4515                inShift = 1;
4516                goto encode_char;
4517            }
4518        }
4519        continue;
4520encode_char:
4521        if (ch >= 0x10000) {
4522            assert(ch <= MAX_UNICODE);
4523
4524            /* code first surrogate */
4525            base64bits += 16;
4526            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4527            while (base64bits >= 6) {
4528                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4529                base64bits -= 6;
4530            }
4531            /* prepare second surrogate */
4532            ch = Py_UNICODE_LOW_SURROGATE(ch);
4533        }
4534        base64bits += 16;
4535        base64buffer = (base64buffer << 16) | ch;
4536        while (base64bits >= 6) {
4537            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4538            base64bits -= 6;
4539        }
4540    }
4541    if (base64bits)
4542        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4543    if (inShift)
4544        *out++ = '-';
4545    if (_PyBytes_Resize(&v, out - start) < 0)
4546        return NULL;
4547    return v;
4548}
4549PyObject *
4550PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4551                     Py_ssize_t size,
4552                     int base64SetO,
4553                     int base64WhiteSpace,
4554                     const char *errors)
4555{
4556    PyObject *result;
4557    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4558    if (tmp == NULL)
4559        return NULL;
4560    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4561                                   base64WhiteSpace, errors);
4562    Py_DECREF(tmp);
4563    return result;
4564}
4565
4566#undef IS_BASE64
4567#undef FROM_BASE64
4568#undef TO_BASE64
4569#undef DECODE_DIRECT
4570#undef ENCODE_DIRECT
4571
4572/* --- UTF-8 Codec -------------------------------------------------------- */
4573
/* Decode `size` bytes of UTF-8 data starting at `s`.
   Thin wrapper: forwards to PyUnicode_DecodeUTF8Stateful() with a NULL
   `consumed` pointer and returns its result unchanged. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
4581
4582#include "stringlib/asciilib.h"
4583#include "stringlib/codecs.h"
4584#include "stringlib/undef.h"
4585
4586#include "stringlib/ucs1lib.h"
4587#include "stringlib/codecs.h"
4588#include "stringlib/undef.h"
4589
4590#include "stringlib/ucs2lib.h"
4591#include "stringlib/codecs.h"
4592#include "stringlib/undef.h"
4593
4594#include "stringlib/ucs4lib.h"
4595#include "stringlib/codecs.h"
4596#include "stringlib/undef.h"
4597
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of `value` is >= 0x80.  Only 4- and 8-byte longs are
   supported; any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4607
/* Copy the leading run of ASCII bytes (high bit clear) of [start, end)
   into `dest` and return the length of that run.  Scanning stops at the
   first byte with the high bit set or at `end`.

   `dest` is assumed to have room for end - start bytes -- the function
   itself performs no bounds check on it. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* last long-aligned address <= end: whole-word reads stop here */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both long-aligned: check and copy
           one long at a time while every byte in it is ASCII. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* Finish byte by byte: either the unaligned tail, or the word in
           which a non-ASCII byte was detected. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only find the end of the ASCII run here, then copy the
       whole run with a single memcpy() below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            /* skip whole longs that contain only ASCII bytes */
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4671
4672PyObject *
4673PyUnicode_DecodeUTF8Stateful(const char *s,
4674                             Py_ssize_t size,
4675                             const char *errors,
4676                             Py_ssize_t *consumed)
4677{
4678    _PyUnicodeWriter writer;
4679    const char *starts = s;
4680    const char *end = s + size;
4681
4682    Py_ssize_t startinpos;
4683    Py_ssize_t endinpos;
4684    const char *errmsg = "";
4685    PyObject *errorHandler = NULL;
4686    PyObject *exc = NULL;
4687
4688    if (size == 0) {
4689        if (consumed)
4690            *consumed = 0;
4691        _Py_RETURN_UNICODE_EMPTY();
4692    }
4693
4694    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4695    if (size == 1 && (unsigned char)s[0] < 128) {
4696        if (consumed)
4697            *consumed = 1;
4698        return get_latin1_char((unsigned char)s[0]);
4699    }
4700
4701    _PyUnicodeWriter_Init(&writer);
4702    writer.min_length = size;
4703    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4704        goto onError;
4705
4706    writer.pos = ascii_decode(s, end, writer.data);
4707    s += writer.pos;
4708    while (s < end) {
4709        Py_UCS4 ch;
4710        int kind = writer.kind;
4711        if (kind == PyUnicode_1BYTE_KIND) {
4712            if (PyUnicode_IS_ASCII(writer.buffer))
4713                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4714            else
4715                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4716        } else if (kind == PyUnicode_2BYTE_KIND) {
4717            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4718        } else {
4719            assert(kind == PyUnicode_4BYTE_KIND);
4720            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4721        }
4722
4723        switch (ch) {
4724        case 0:
4725            if (s == end || consumed)
4726                goto End;
4727            errmsg = "unexpected end of data";
4728            startinpos = s - starts;
4729            endinpos = end - starts;
4730            break;
4731        case 1:
4732            errmsg = "invalid start byte";
4733            startinpos = s - starts;
4734            endinpos = startinpos + 1;
4735            break;
4736        case 2:
4737        case 3:
4738        case 4:
4739            errmsg = "invalid continuation byte";
4740            startinpos = s - starts;
4741            endinpos = startinpos + ch - 1;
4742            break;
4743        default:
4744            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4745                goto onError;
4746            continue;
4747        }
4748
4749        if (unicode_decode_call_errorhandler_writer(
4750                errors, &errorHandler,
4751                "utf-8", errmsg,
4752                &starts, &end, &startinpos, &endinpos, &exc, &s,
4753                &writer))
4754            goto onError;
4755    }
4756
4757End:
4758    if (consumed)
4759        *consumed = s - starts;
4760
4761    Py_XDECREF(errorHandler);
4762    Py_XDECREF(exc);
4763    return _PyUnicodeWriter_Finish(&writer);
4764
4765onError:
4766    Py_XDECREF(errorHandler);
4767    Py_XDECREF(exc);
4768    _PyUnicodeWriter_Dealloc(&writer);
4769    return NULL;
4770}
4771
4772#ifdef __APPLE__
4773
4774/* Simplified UTF-8 decoder using surrogateescape error handler,
4775   used to decode the command line arguments on Mac OS X.
4776
4777   Return a pointer to a newly allocated wide character string (use
4778   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4779
4780wchar_t*
4781_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4782{
4783    const char *e;
4784    wchar_t *unicode;
4785    Py_ssize_t outpos;
4786
4787    /* Note: size will always be longer than the resulting Unicode
4788       character count */
4789    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
4790        return NULL;
4791    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4792    if (!unicode)
4793        return NULL;
4794
4795    /* Unpack UTF-8 encoded data */
4796    e = s + size;
4797    outpos = 0;
4798    while (s < e) {
4799        Py_UCS4 ch;
4800#if SIZEOF_WCHAR_T == 4
4801        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4802#else
4803        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4804#endif
4805        if (ch > 0xFF) {
4806#if SIZEOF_WCHAR_T == 4
4807            assert(0);
4808#else
4809            assert(Py_UNICODE_IS_SURROGATE(ch));
4810            /*  compute and append the two surrogates: */
4811            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4812            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4813#endif
4814        }
4815        else {
4816            if (!ch && s == e)
4817                break;
4818            /* surrogateescape */
4819            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4820        }
4821    }
4822    unicode[outpos] = L'\0';
4823    return unicode;
4824}
4825
4826#endif /* __APPLE__ */
4827
4828/* Primary internal function which creates utf8 encoded bytes objects.
4829
4830   Allocation strategy:  if the string is short, convert into a stack buffer
4831   and allocate exactly as much space needed at the end.  Else allocate the
4832   maximum possible needed (4 result bytes per Unicode character), and return
4833   the excess memory at the end.
4834*/
4835PyObject *
4836_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4837{
4838    enum PyUnicode_Kind kind;
4839    void *data;
4840    Py_ssize_t size;
4841
4842    if (!PyUnicode_Check(unicode)) {
4843        PyErr_BadArgument();
4844        return NULL;
4845    }
4846
4847    if (PyUnicode_READY(unicode) == -1)
4848        return NULL;
4849
4850    if (PyUnicode_UTF8(unicode))
4851        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4852                                         PyUnicode_UTF8_LENGTH(unicode));
4853
4854    kind = PyUnicode_KIND(unicode);
4855    data = PyUnicode_DATA(unicode);
4856    size = PyUnicode_GET_LENGTH(unicode);
4857
4858    switch (kind) {
4859    default:
4860        assert(0);
4861    case PyUnicode_1BYTE_KIND:
4862        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4863        assert(!PyUnicode_IS_ASCII(unicode));
4864        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4865    case PyUnicode_2BYTE_KIND:
4866        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4867    case PyUnicode_4BYTE_KIND:
4868        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4869    }
4870}
4871
4872PyObject *
4873PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4874                     Py_ssize_t size,
4875                     const char *errors)
4876{
4877    PyObject *v, *unicode;
4878
4879    unicode = PyUnicode_FromUnicode(s, size);
4880    if (unicode == NULL)
4881        return NULL;
4882    v = _PyUnicode_AsUTF8String(unicode, errors);
4883    Py_DECREF(unicode);
4884    return v;
4885}
4886
4887PyObject *
4888PyUnicode_AsUTF8String(PyObject *unicode)
4889{
4890    return _PyUnicode_AsUTF8String(unicode, NULL);
4891}
4892
4893/* --- UTF-32 Codec ------------------------------------------------------- */
4894
4895PyObject *
4896PyUnicode_DecodeUTF32(const char *s,
4897                      Py_ssize_t size,
4898                      const char *errors,
4899                      int *byteorder)
4900{
4901    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4902}
4903
4904PyObject *
4905PyUnicode_DecodeUTF32Stateful(const char *s,
4906                              Py_ssize_t size,
4907                              const char *errors,
4908                              int *byteorder,
4909                              Py_ssize_t *consumed)
4910{
4911    const char *starts = s;
4912    Py_ssize_t startinpos;
4913    Py_ssize_t endinpos;
4914    _PyUnicodeWriter writer;
4915    const unsigned char *q, *e;
4916    int le, bo = 0;       /* assume native ordering by default */
4917    const char *encoding;
4918    const char *errmsg = "";
4919    PyObject *errorHandler = NULL;
4920    PyObject *exc = NULL;
4921
4922    q = (unsigned char *)s;
4923    e = q + size;
4924
4925    if (byteorder)
4926        bo = *byteorder;
4927
4928    /* Check for BOM marks (U+FEFF) in the input and adjust current
4929       byte order setting accordingly. In native mode, the leading BOM
4930       mark is skipped, in all other modes, it is copied to the output
4931       stream as-is (giving a ZWNBSP character). */
4932    if (bo == 0 && size >= 4) {
4933        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4934        if (bom == 0x0000FEFF) {
4935            bo = -1;
4936            q += 4;
4937        }
4938        else if (bom == 0xFFFE0000) {
4939            bo = 1;
4940            q += 4;
4941        }
4942        if (byteorder)
4943            *byteorder = bo;
4944    }
4945
4946    if (q == e) {
4947        if (consumed)
4948            *consumed = size;
4949        _Py_RETURN_UNICODE_EMPTY();
4950    }
4951
4952#ifdef WORDS_BIGENDIAN
4953    le = bo < 0;
4954#else
4955    le = bo <= 0;
4956#endif
4957    encoding = le ? "utf-32-le" : "utf-32-be";
4958
4959    _PyUnicodeWriter_Init(&writer);
4960    writer.min_length = (e - q + 3) / 4;
4961    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4962        goto onError;
4963
4964    while (1) {
4965        Py_UCS4 ch = 0;
4966        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
4967
4968        if (e - q >= 4) {
4969            enum PyUnicode_Kind kind = writer.kind;
4970            void *data = writer.data;
4971            const unsigned char *last = e - 4;
4972            Py_ssize_t pos = writer.pos;
4973            if (le) {
4974                do {
4975                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4976                    if (ch > maxch)
4977                        break;
4978                    if (kind != PyUnicode_1BYTE_KIND &&
4979                        Py_UNICODE_IS_SURROGATE(ch))
4980                        break;
4981                    PyUnicode_WRITE(kind, data, pos++, ch);
4982                    q += 4;
4983                } while (q <= last);
4984            }
4985            else {
4986                do {
4987                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4988                    if (ch > maxch)
4989                        break;
4990                    if (kind != PyUnicode_1BYTE_KIND &&
4991                        Py_UNICODE_IS_SURROGATE(ch))
4992                        break;
4993                    PyUnicode_WRITE(kind, data, pos++, ch);
4994                    q += 4;
4995                } while (q <= last);
4996            }
4997            writer.pos = pos;
4998        }
4999
5000        if (Py_UNICODE_IS_SURROGATE(ch)) {
5001            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5002            startinpos = ((const char *)q) - starts;
5003            endinpos = startinpos + 4;
5004        }
5005        else if (ch <= maxch) {
5006            if (q == e || consumed)
5007                break;
5008            /* remaining bytes at the end? (size should be divisible by 4) */
5009            errmsg = "truncated data";
5010            startinpos = ((const char *)q) - starts;
5011            endinpos = ((const char *)e) - starts;
5012        }
5013        else {
5014            if (ch < 0x110000) {
5015                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5016                    goto onError;
5017                q += 4;
5018                continue;
5019            }
5020            errmsg = "code point not in range(0x110000)";
5021            startinpos = ((const char *)q) - starts;
5022            endinpos = startinpos + 4;
5023        }
5024
5025        /* The remaining input chars are ignored if the callback
5026           chooses to skip the input */
5027        if (unicode_decode_call_errorhandler_writer(
5028                errors, &errorHandler,
5029                encoding, errmsg,
5030                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5031                &writer))
5032            goto onError;
5033    }
5034
5035    if (consumed)
5036        *consumed = (const char *)q-starts;
5037
5038    Py_XDECREF(errorHandler);
5039    Py_XDECREF(exc);
5040    return _PyUnicodeWriter_Finish(&writer);
5041
5042  onError:
5043    _PyUnicodeWriter_Dealloc(&writer);
5044    Py_XDECREF(errorHandler);
5045    Py_XDECREF(exc);
5046    return NULL;
5047}
5048
5049PyObject *
5050_PyUnicode_EncodeUTF32(PyObject *str,
5051                       const char *errors,
5052                       int byteorder)
5053{
5054    enum PyUnicode_Kind kind;
5055    const void *data;
5056    Py_ssize_t len;
5057    PyObject *v;
5058    PY_UINT32_T *out;
5059#if PY_LITTLE_ENDIAN
5060    int native_ordering = byteorder <= 0;
5061#else
5062    int native_ordering = byteorder >= 0;
5063#endif
5064    const char *encoding;
5065    Py_ssize_t nsize, pos;
5066    PyObject *errorHandler = NULL;
5067    PyObject *exc = NULL;
5068    PyObject *rep = NULL;
5069
5070    if (!PyUnicode_Check(str)) {
5071        PyErr_BadArgument();
5072        return NULL;
5073    }
5074    if (PyUnicode_READY(str) == -1)
5075        return NULL;
5076    kind = PyUnicode_KIND(str);
5077    data = PyUnicode_DATA(str);
5078    len = PyUnicode_GET_LENGTH(str);
5079
5080    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5081        return PyErr_NoMemory();
5082    nsize = len + (byteorder == 0);
5083    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5084    if (v == NULL)
5085        return NULL;
5086
5087    /* output buffer is 4-bytes aligned */
5088    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5089    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
5090    if (byteorder == 0)
5091        *out++ = 0xFEFF;
5092    if (len == 0)
5093        goto done;
5094
5095    if (byteorder == -1)
5096        encoding = "utf-32-le";
5097    else if (byteorder == 1)
5098        encoding = "utf-32-be";
5099    else
5100        encoding = "utf-32";
5101
5102    if (kind == PyUnicode_1BYTE_KIND) {
5103        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5104        goto done;
5105    }
5106
5107    pos = 0;
5108    while (pos < len) {
5109        Py_ssize_t repsize, moreunits;
5110
5111        if (kind == PyUnicode_2BYTE_KIND) {
5112            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5113                                        &out, native_ordering);
5114        }
5115        else {
5116            assert(kind == PyUnicode_4BYTE_KIND);
5117            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5118                                        &out, native_ordering);
5119        }
5120        if (pos == len)
5121            break;
5122
5123        rep = unicode_encode_call_errorhandler(
5124                errors, &errorHandler,
5125                encoding, "surrogates not allowed",
5126                str, &exc, pos, pos + 1, &pos);
5127        if (!rep)
5128            goto error;
5129
5130        if (PyBytes_Check(rep)) {
5131            repsize = PyBytes_GET_SIZE(rep);
5132            if (repsize & 3) {
5133                raise_encode_exception(&exc, encoding,
5134                                       str, pos - 1, pos,
5135                                       "surrogates not allowed");
5136                goto error;
5137            }
5138            moreunits = repsize / 4;
5139        }
5140        else {
5141            assert(PyUnicode_Check(rep));
5142            if (PyUnicode_READY(rep) < 0)
5143                goto error;
5144            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5145            if (!PyUnicode_IS_ASCII(rep)) {
5146                raise_encode_exception(&exc, encoding,
5147                                       str, pos - 1, pos,
5148                                       "surrogates not allowed");
5149                goto error;
5150            }
5151        }
5152
5153        /* four bytes are reserved for each surrogate */
5154        if (moreunits > 1) {
5155            Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
5156            Py_ssize_t morebytes = 4 * (moreunits - 1);
5157            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5158                /* integer overflow */
5159                PyErr_NoMemory();
5160                goto error;
5161            }
5162            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5163                goto error;
5164            out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
5165        }
5166
5167        if (PyBytes_Check(rep)) {
5168            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5169            out += moreunits;
5170        } else /* rep is unicode */ {
5171            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5172            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5173                                 &out, native_ordering);
5174        }
5175
5176        Py_CLEAR(rep);
5177    }
5178
5179    /* Cut back to size actually needed. This is necessary for, for example,
5180       encoding of a string containing isolated surrogates and the 'ignore'
5181       handler is used. */
5182    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5183    if (nsize != PyBytes_GET_SIZE(v))
5184      _PyBytes_Resize(&v, nsize);
5185    Py_XDECREF(errorHandler);
5186    Py_XDECREF(exc);
5187  done:
5188    return v;
5189  error:
5190    Py_XDECREF(rep);
5191    Py_XDECREF(errorHandler);
5192    Py_XDECREF(exc);
5193    Py_XDECREF(v);
5194    return NULL;
5195}
5196
5197PyObject *
5198PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5199                      Py_ssize_t size,
5200                      const char *errors,
5201                      int byteorder)
5202{
5203    PyObject *result;
5204    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5205    if (tmp == NULL)
5206        return NULL;
5207    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5208    Py_DECREF(tmp);
5209    return result;
5210}
5211
5212PyObject *
5213PyUnicode_AsUTF32String(PyObject *unicode)
5214{
5215    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5216}
5217
5218/* --- UTF-16 Codec ------------------------------------------------------- */
5219
5220PyObject *
5221PyUnicode_DecodeUTF16(const char *s,
5222                      Py_ssize_t size,
5223                      const char *errors,
5224                      int *byteorder)
5225{
5226    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5227}
5228
5229PyObject *
5230PyUnicode_DecodeUTF16Stateful(const char *s,
5231                              Py_ssize_t size,
5232                              const char *errors,
5233                              int *byteorder,
5234                              Py_ssize_t *consumed)
5235{
5236    const char *starts = s;
5237    Py_ssize_t startinpos;
5238    Py_ssize_t endinpos;
5239    _PyUnicodeWriter writer;
5240    const unsigned char *q, *e;
5241    int bo = 0;       /* assume native ordering by default */
5242    int native_ordering;
5243    const char *errmsg = "";
5244    PyObject *errorHandler = NULL;
5245    PyObject *exc = NULL;
5246    const char *encoding;
5247
5248    q = (unsigned char *)s;
5249    e = q + size;
5250
5251    if (byteorder)
5252        bo = *byteorder;
5253
5254    /* Check for BOM marks (U+FEFF) in the input and adjust current
5255       byte order setting accordingly. In native mode, the leading BOM
5256       mark is skipped, in all other modes, it is copied to the output
5257       stream as-is (giving a ZWNBSP character). */
5258    if (bo == 0 && size >= 2) {
5259        const Py_UCS4 bom = (q[1] << 8) | q[0];
5260        if (bom == 0xFEFF) {
5261            q += 2;
5262            bo = -1;
5263        }
5264        else if (bom == 0xFFFE) {
5265            q += 2;
5266            bo = 1;
5267        }
5268        if (byteorder)
5269            *byteorder = bo;
5270    }
5271
5272    if (q == e) {
5273        if (consumed)
5274            *consumed = size;
5275        _Py_RETURN_UNICODE_EMPTY();
5276    }
5277
5278#if PY_LITTLE_ENDIAN
5279    native_ordering = bo <= 0;
5280    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5281#else
5282    native_ordering = bo >= 0;
5283    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5284#endif
5285
5286    /* Note: size will always be longer than the resulting Unicode
5287       character count */
5288    _PyUnicodeWriter_Init(&writer);
5289    writer.min_length = (e - q + 1) / 2;
5290    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5291        goto onError;
5292
5293    while (1) {
5294        Py_UCS4 ch = 0;
5295        if (e - q >= 2) {
5296            int kind = writer.kind;
5297            if (kind == PyUnicode_1BYTE_KIND) {
5298                if (PyUnicode_IS_ASCII(writer.buffer))
5299                    ch = asciilib_utf16_decode(&q, e,
5300                            (Py_UCS1*)writer.data, &writer.pos,
5301                            native_ordering);
5302                else
5303                    ch = ucs1lib_utf16_decode(&q, e,
5304                            (Py_UCS1*)writer.data, &writer.pos,
5305                            native_ordering);
5306            } else if (kind == PyUnicode_2BYTE_KIND) {
5307                ch = ucs2lib_utf16_decode(&q, e,
5308                        (Py_UCS2*)writer.data, &writer.pos,
5309                        native_ordering);
5310            } else {
5311                assert(kind == PyUnicode_4BYTE_KIND);
5312                ch = ucs4lib_utf16_decode(&q, e,
5313                        (Py_UCS4*)writer.data, &writer.pos,
5314                        native_ordering);
5315            }
5316        }
5317
5318        switch (ch)
5319        {
5320        case 0:
5321            /* remaining byte at the end? (size should be even) */
5322            if (q == e || consumed)
5323                goto End;
5324            errmsg = "truncated data";
5325            startinpos = ((const char *)q) - starts;
5326            endinpos = ((const char *)e) - starts;
5327            break;
5328            /* The remaining input chars are ignored if the callback
5329               chooses to skip the input */
5330        case 1:
5331            q -= 2;
5332            if (consumed)
5333                goto End;
5334            errmsg = "unexpected end of data";
5335            startinpos = ((const char *)q) - starts;
5336            endinpos = ((const char *)e) - starts;
5337            break;
5338        case 2:
5339            errmsg = "illegal encoding";
5340            startinpos = ((const char *)q) - 2 - starts;
5341            endinpos = startinpos + 2;
5342            break;
5343        case 3:
5344            errmsg = "illegal UTF-16 surrogate";
5345            startinpos = ((const char *)q) - 4 - starts;
5346            endinpos = startinpos + 2;
5347            break;
5348        default:
5349            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5350                goto onError;
5351            continue;
5352        }
5353
5354        if (unicode_decode_call_errorhandler_writer(
5355                errors,
5356                &errorHandler,
5357                encoding, errmsg,
5358                &starts,
5359                (const char **)&e,
5360                &startinpos,
5361                &endinpos,
5362                &exc,
5363                (const char **)&q,
5364                &writer))
5365            goto onError;
5366    }
5367
5368End:
5369    if (consumed)
5370        *consumed = (const char *)q-starts;
5371
5372    Py_XDECREF(errorHandler);
5373    Py_XDECREF(exc);
5374    return _PyUnicodeWriter_Finish(&writer);
5375
5376  onError:
5377    _PyUnicodeWriter_Dealloc(&writer);
5378    Py_XDECREF(errorHandler);
5379    Py_XDECREF(exc);
5380    return NULL;
5381}
5382
5383PyObject *
5384_PyUnicode_EncodeUTF16(PyObject *str,
5385                       const char *errors,
5386                       int byteorder)
5387{
5388    enum PyUnicode_Kind kind;
5389    const void *data;
5390    Py_ssize_t len;
5391    PyObject *v;
5392    unsigned short *out;
5393    Py_ssize_t pairs;
5394#if PY_BIG_ENDIAN
5395    int native_ordering = byteorder >= 0;
5396#else
5397    int native_ordering = byteorder <= 0;
5398#endif
5399    const char *encoding;
5400    Py_ssize_t nsize, pos;
5401    PyObject *errorHandler = NULL;
5402    PyObject *exc = NULL;
5403    PyObject *rep = NULL;
5404
5405    if (!PyUnicode_Check(str)) {
5406        PyErr_BadArgument();
5407        return NULL;
5408    }
5409    if (PyUnicode_READY(str) == -1)
5410        return NULL;
5411    kind = PyUnicode_KIND(str);
5412    data = PyUnicode_DATA(str);
5413    len = PyUnicode_GET_LENGTH(str);
5414
5415    pairs = 0;
5416    if (kind == PyUnicode_4BYTE_KIND) {
5417        const Py_UCS4 *in = (const Py_UCS4 *)data;
5418        const Py_UCS4 *end = in + len;
5419        while (in < end)
5420            if (*in++ >= 0x10000)
5421                pairs++;
5422    }
5423    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5424        return PyErr_NoMemory();
5425    nsize = len + pairs + (byteorder == 0);
5426    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5427    if (v == NULL)
5428        return NULL;
5429
5430    /* output buffer is 2-bytes aligned */
5431    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5432    out = (unsigned short *)PyBytes_AS_STRING(v);
5433    if (byteorder == 0)
5434        *out++ = 0xFEFF;
5435    if (len == 0)
5436        goto done;
5437
5438    if (kind == PyUnicode_1BYTE_KIND) {
5439        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5440        goto done;
5441    }
5442
5443    if (byteorder < 0)
5444        encoding = "utf-16-le";
5445    else if (byteorder > 0)
5446        encoding = "utf-16-be";
5447    else
5448        encoding = "utf-16";
5449
5450    pos = 0;
5451    while (pos < len) {
5452        Py_ssize_t repsize, moreunits;
5453
5454        if (kind == PyUnicode_2BYTE_KIND) {
5455            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5456                                        &out, native_ordering);
5457        }
5458        else {
5459            assert(kind == PyUnicode_4BYTE_KIND);
5460            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5461                                        &out, native_ordering);
5462        }
5463        if (pos == len)
5464            break;
5465
5466        rep = unicode_encode_call_errorhandler(
5467                errors, &errorHandler,
5468                encoding, "surrogates not allowed",
5469                str, &exc, pos, pos + 1, &pos);
5470        if (!rep)
5471            goto error;
5472
5473        if (PyBytes_Check(rep)) {
5474            repsize = PyBytes_GET_SIZE(rep);
5475            if (repsize & 1) {
5476                raise_encode_exception(&exc, encoding,
5477                                       str, pos - 1, pos,
5478                                       "surrogates not allowed");
5479                goto error;
5480            }
5481            moreunits = repsize / 2;
5482        }
5483        else {
5484            assert(PyUnicode_Check(rep));
5485            if (PyUnicode_READY(rep) < 0)
5486                goto error;
5487            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5488            if (!PyUnicode_IS_ASCII(rep)) {
5489                raise_encode_exception(&exc, encoding,
5490                                       str, pos - 1, pos,
5491                                       "surrogates not allowed");
5492                goto error;
5493            }
5494        }
5495
5496        /* two bytes are reserved for each surrogate */
5497        if (moreunits > 1) {
5498            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5499            Py_ssize_t morebytes = 2 * (moreunits - 1);
5500            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5501                /* integer overflow */
5502                PyErr_NoMemory();
5503                goto error;
5504            }
5505            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5506                goto error;
5507            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5508        }
5509
5510        if (PyBytes_Check(rep)) {
5511            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5512            out += moreunits;
5513        } else /* rep is unicode */ {
5514            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5515            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5516                                 &out, native_ordering);
5517        }
5518
5519        Py_CLEAR(rep);
5520    }
5521
5522    /* Cut back to size actually needed. This is necessary for, for example,
5523    encoding of a string containing isolated surrogates and the 'ignore' handler
5524    is used. */
5525    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5526    if (nsize != PyBytes_GET_SIZE(v))
5527      _PyBytes_Resize(&v, nsize);
5528    Py_XDECREF(errorHandler);
5529    Py_XDECREF(exc);
5530  done:
5531    return v;
5532  error:
5533    Py_XDECREF(rep);
5534    Py_XDECREF(errorHandler);
5535    Py_XDECREF(exc);
5536    Py_XDECREF(v);
5537    return NULL;
5538#undef STORECHAR
5539}
5540
5541PyObject *
5542PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5543                      Py_ssize_t size,
5544                      const char *errors,
5545                      int byteorder)
5546{
5547    PyObject *result;
5548    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5549    if (tmp == NULL)
5550        return NULL;
5551    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5552    Py_DECREF(tmp);
5553    return result;
5554}
5555
5556PyObject *
5557PyUnicode_AsUTF16String(PyObject *unicode)
5558{
5559    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5560}
5561
5562/* --- Unicode Escape Codec ----------------------------------------------- */
5563
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string S of SIZE bytes decodes to a pure ASCII string.

   Returns the number of characters the decoded string will have, or -1
   if SIZE is negative, if S contains a non-ASCII byte, if a backslash is
   the last byte or is followed by a non-ASCII byte, or if S contains an
   escape whose result is not guaranteed to be ASCII (octal digits, \x,
   \u, \U, \N).

   Counting rules: a plain character counts 1; backslash + newline counts
   0; the simple escapes (\\ \' \" \b \f \t \n \r \v \a) count 1; any
   other escape is kept verbatim and counts 2 (backslash + character).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming the end pointer; doing the
       pointer arithmetic first would step outside the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5618
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on demand by the \N{...} handling of
   the unicode-escape decoder that follows -- confirm against the code
   that assigns it. */
5619static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5620
5621PyObject *
5622PyUnicode_DecodeUnicodeEscape(const char *s,
5623                              Py_ssize_t size,
5624                              const char *errors)
5625{
5626    const char *starts = s;
5627    Py_ssize_t startinpos;
5628    Py_ssize_t endinpos;
5629    _PyUnicodeWriter writer;
5630    const char *end;
5631    char* message;
5632    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5633    PyObject *errorHandler = NULL;
5634    PyObject *exc = NULL;
5635    Py_ssize_t len;
5636
5637    len = length_of_escaped_ascii_string(s, size);
5638    if (len == 0)
5639        _Py_RETURN_UNICODE_EMPTY();
5640
5641    /* After length_of_escaped_ascii_string() there are two alternatives,
5642       either the string is pure ASCII with named escapes like \n, etc.
5643       and we determined it's exact size (common case)
5644       or it contains \x, \u, ... escape sequences.  then we create a
5645       legacy wchar string and resize it at the end of this function. */
5646    _PyUnicodeWriter_Init(&writer);
5647    if (len > 0) {
5648        writer.min_length = len;
5649    }
5650    else {
5651        /* Escaped strings will always be longer than the resulting
5652           Unicode string, so we start with size here and then reduce the
5653           length after conversion to the true value.
5654           (but if the error callback returns a long replacement string
5655           we'll have to allocate more space) */
5656        writer.min_length = size;
5657    }
5658
5659    if (size == 0)
5660        return _PyUnicodeWriter_Finish(&writer);
5661    end = s + size;
5662
5663    while (s < end) {
5664        unsigned char c;
5665        Py_UCS4 x;
5666        int digits;
5667
5668        /* Non-escape characters are interpreted as Unicode ordinals */
5669        if (*s != '\\') {
5670            x = (unsigned char)*s;
5671            s++;
5672            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5673                goto onError;
5674            continue;
5675        }
5676
5677        startinpos = s-starts;
5678        /* \ - Escapes */
5679        s++;
5680        c = *s++;
5681        if (s > end)
5682            c = '\0'; /* Invalid after \ */
5683
5684        switch (c) {
5685
5686            /* \x escapes */
5687#define WRITECHAR(ch)                                                      \
5688            do {                                                           \
5689                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5690                    goto onError;                                          \
5691            } while(0)
5692
5693        case '\n': break;
5694        case '\\': WRITECHAR('\\'); break;
5695        case '\'': WRITECHAR('\''); break;
5696        case '\"': WRITECHAR('\"'); break;
5697        case 'b': WRITECHAR('\b'); break;
5698        /* FF */
5699        case 'f': WRITECHAR('\014'); break;
5700        case 't': WRITECHAR('\t'); break;
5701        case 'n': WRITECHAR('\n'); break;
5702        case 'r': WRITECHAR('\r'); break;
5703        /* VT */
5704        case 'v': WRITECHAR('\013'); break;
5705        /* BEL, not classic C */
5706        case 'a': WRITECHAR('\007'); break;
5707
5708            /* \OOO (octal) escapes */
5709        case '0': case '1': case '2': case '3':
5710        case '4': case '5': case '6': case '7':
5711            x = s[-1] - '0';
5712            if (s < end && '0' <= *s && *s <= '7') {
5713                x = (x<<3) + *s++ - '0';
5714                if (s < end && '0' <= *s && *s <= '7')
5715                    x = (x<<3) + *s++ - '0';
5716            }
5717            WRITECHAR(x);
5718            break;
5719
5720            /* hex escapes */
5721            /* \xXX */
5722        case 'x':
5723            digits = 2;
5724            message = "truncated \\xXX escape";
5725            goto hexescape;
5726
5727            /* \uXXXX */
5728        case 'u':
5729            digits = 4;
5730            message = "truncated \\uXXXX escape";
5731            goto hexescape;
5732
5733            /* \UXXXXXXXX */
5734        case 'U':
5735            digits = 8;
5736            message = "truncated \\UXXXXXXXX escape";
5737        hexescape:
5738            chr = 0;
5739            if (end - s < digits) {
5740                /* count only hex digits */
5741                for (; s < end; ++s) {
5742                    c = (unsigned char)*s;
5743                    if (!Py_ISXDIGIT(c))
5744                        goto error;
5745                }
5746                goto error;
5747            }
5748            for (; digits--; ++s) {
5749                c = (unsigned char)*s;
5750                if (!Py_ISXDIGIT(c))
5751                    goto error;
5752                chr = (chr<<4) & ~0xF;
5753                if (c >= '0' && c <= '9')
5754                    chr += c - '0';
5755                else if (c >= 'a' && c <= 'f')
5756                    chr += 10 + c - 'a';
5757                else
5758                    chr += 10 + c - 'A';
5759            }
5760            if (chr == 0xffffffff && PyErr_Occurred())
5761                /* _decoding_error will have already written into the
5762                   target buffer. */
5763                break;
5764        store:
5765            /* when we get here, chr is a 32-bit unicode character */
5766            message = "illegal Unicode character";
5767            if (chr > MAX_UNICODE)
5768                goto error;
5769            WRITECHAR(chr);
5770            break;
5771
5772            /* \N{name} */
5773        case 'N':
5774            message = "malformed \\N character escape";
5775            if (ucnhash_CAPI == NULL) {
5776                /* load the unicode data module */
5777                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5778                                                PyUnicodeData_CAPSULE_NAME, 1);
5779                if (ucnhash_CAPI == NULL)
5780                    goto ucnhashError;
5781            }
5782            if (*s == '{') {
5783                const char *start = s+1;
5784                /* look for the closing brace */
5785                while (*s != '}' && s < end)
5786                    s++;
5787                if (s > start && s < end && *s == '}') {
5788                    /* found a name.  look it up in the unicode database */
5789                    message = "unknown Unicode character name";
5790                    s++;
5791                    if (s - start - 1 <= INT_MAX &&
5792                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5793                                              &chr, 0))
5794                        goto store;
5795                }
5796            }
5797            goto error;
5798
5799        default:
5800            if (s > end) {
5801                message = "\\ at end of string";
5802                s--;
5803                goto error;
5804            }
5805            else {
5806                WRITECHAR('\\');
5807                WRITECHAR((unsigned char)s[-1]);
5808            }
5809            break;
5810        }
5811        continue;
5812
5813      error:
5814        endinpos = s-starts;
5815        if (unicode_decode_call_errorhandler_writer(
5816                errors, &errorHandler,
5817                "unicodeescape", message,
5818                &starts, &end, &startinpos, &endinpos, &exc, &s,
5819                &writer))
5820            goto onError;
5821        continue;
5822    }
5823#undef WRITECHAR
5824
5825    Py_XDECREF(errorHandler);
5826    Py_XDECREF(exc);
5827    return _PyUnicodeWriter_Finish(&writer);
5828
5829  ucnhashError:
5830    PyErr_SetString(
5831        PyExc_UnicodeError,
5832        "\\N escapes not supported (can't load unicodedata module)"
5833        );
5834    _PyUnicodeWriter_Dealloc(&writer);
5835    Py_XDECREF(errorHandler);
5836    Py_XDECREF(exc);
5837    return NULL;
5838
5839  onError:
5840    _PyUnicodeWriter_Dealloc(&writer);
5841    Py_XDECREF(errorHandler);
5842    Py_XDECREF(exc);
5843    return NULL;
5844}
5845
5846/* Return a Unicode-Escape string version of the Unicode object.
5847
5848   If quotes is true, the string is enclosed in u"" or u'' quotes as
5849   appropriate.
5850
5851*/
5852
5853PyObject *
5854PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5855{
5856    Py_ssize_t i, len;
5857    PyObject *repr;
5858    char *p;
5859    int kind;
5860    void *data;
5861    Py_ssize_t expandsize = 0;
5862
5863    /* Initial allocation is based on the longest-possible character
5864       escape.
5865
5866       For UCS1 strings it's '\xxx', 4 bytes per source character.
5867       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5868       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5869    */
5870
5871    if (!PyUnicode_Check(unicode)) {
5872        PyErr_BadArgument();
5873        return NULL;
5874    }
5875    if (PyUnicode_READY(unicode) == -1)
5876        return NULL;
5877    len = PyUnicode_GET_LENGTH(unicode);
5878    kind = PyUnicode_KIND(unicode);
5879    data = PyUnicode_DATA(unicode);
5880    switch (kind) {
5881    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5882    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5883    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5884    }
5885
5886    if (len == 0)
5887        return PyBytes_FromStringAndSize(NULL, 0);
5888
5889    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5890        return PyErr_NoMemory();
5891
5892    repr = PyBytes_FromStringAndSize(NULL,
5893                                     2
5894                                     + expandsize*len
5895                                     + 1);
5896    if (repr == NULL)
5897        return NULL;
5898
5899    p = PyBytes_AS_STRING(repr);
5900
5901    for (i = 0; i < len; i++) {
5902        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5903
5904        /* Escape backslashes */
5905        if (ch == '\\') {
5906            *p++ = '\\';
5907            *p++ = (char) ch;
5908            continue;
5909        }
5910
5911        /* Map 21-bit characters to '\U00xxxxxx' */
5912        else if (ch >= 0x10000) {
5913            assert(ch <= MAX_UNICODE);
5914            *p++ = '\\';
5915            *p++ = 'U';
5916            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5917            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5918            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5919            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5920            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5921            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5922            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5923            *p++ = Py_hexdigits[ch & 0x0000000F];
5924            continue;
5925        }
5926
5927        /* Map 16-bit characters to '\uxxxx' */
5928        if (ch >= 256) {
5929            *p++ = '\\';
5930            *p++ = 'u';
5931            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5932            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5933            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5934            *p++ = Py_hexdigits[ch & 0x000F];
5935        }
5936
5937        /* Map special whitespace to '\t', \n', '\r' */
5938        else if (ch == '\t') {
5939            *p++ = '\\';
5940            *p++ = 't';
5941        }
5942        else if (ch == '\n') {
5943            *p++ = '\\';
5944            *p++ = 'n';
5945        }
5946        else if (ch == '\r') {
5947            *p++ = '\\';
5948            *p++ = 'r';
5949        }
5950
5951        /* Map non-printable US ASCII to '\xhh' */
5952        else if (ch < ' ' || ch >= 0x7F) {
5953            *p++ = '\\';
5954            *p++ = 'x';
5955            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5956            *p++ = Py_hexdigits[ch & 0x000F];
5957        }
5958
5959        /* Copy everything else as-is */
5960        else
5961            *p++ = (char) ch;
5962    }
5963
5964    assert(p - PyBytes_AS_STRING(repr) > 0);
5965    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5966        return NULL;
5967    return repr;
5968}
5969
5970PyObject *
5971PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5972                              Py_ssize_t size)
5973{
5974    PyObject *result;
5975    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5976    if (tmp == NULL)
5977        return NULL;
5978    result = PyUnicode_AsUnicodeEscapeString(tmp);
5979    Py_DECREF(tmp);
5980    return result;
5981}
5982
5983/* --- Raw Unicode Escape Codec ------------------------------------------- */
5984
5985PyObject *
5986PyUnicode_DecodeRawUnicodeEscape(const char *s,
5987                                 Py_ssize_t size,
5988                                 const char *errors)
5989{
5990    const char *starts = s;
5991    Py_ssize_t startinpos;
5992    Py_ssize_t endinpos;
5993    _PyUnicodeWriter writer;
5994    const char *end;
5995    const char *bs;
5996    PyObject *errorHandler = NULL;
5997    PyObject *exc = NULL;
5998
5999    if (size == 0)
6000        _Py_RETURN_UNICODE_EMPTY();
6001
6002    /* Escaped strings will always be longer than the resulting
6003       Unicode string, so we start with size here and then reduce the
6004       length after conversion to the true value. (But decoding error
6005       handler might have to resize the string) */
6006    _PyUnicodeWriter_Init(&writer);
6007    writer.min_length = size;
6008
6009    end = s + size;
6010    while (s < end) {
6011        unsigned char c;
6012        Py_UCS4 x;
6013        int i;
6014        int count;
6015
6016        /* Non-escape characters are interpreted as Unicode ordinals */
6017        if (*s != '\\') {
6018            x = (unsigned char)*s++;
6019            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6020                goto onError;
6021            continue;
6022        }
6023        startinpos = s-starts;
6024
6025        /* \u-escapes are only interpreted iff the number of leading
6026           backslashes if odd */
6027        bs = s;
6028        for (;s < end;) {
6029            if (*s != '\\')
6030                break;
6031            x = (unsigned char)*s++;
6032            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6033                goto onError;
6034        }
6035        if (((s - bs) & 1) == 0 ||
6036            s >= end ||
6037            (*s != 'u' && *s != 'U')) {
6038            continue;
6039        }
6040        writer.pos--;
6041        count = *s=='u' ? 4 : 8;
6042        s++;
6043
6044        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6045        for (x = 0, i = 0; i < count; ++i, ++s) {
6046            c = (unsigned char)*s;
6047            if (!Py_ISXDIGIT(c)) {
6048                endinpos = s-starts;
6049                if (unicode_decode_call_errorhandler_writer(
6050                        errors, &errorHandler,
6051                        "rawunicodeescape", "truncated \\uXXXX",
6052                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6053                        &writer))
6054                    goto onError;
6055                goto nextByte;
6056            }
6057            x = (x<<4) & ~0xF;
6058            if (c >= '0' && c <= '9')
6059                x += c - '0';
6060            else if (c >= 'a' && c <= 'f')
6061                x += 10 + c - 'a';
6062            else
6063                x += 10 + c - 'A';
6064        }
6065        if (x <= MAX_UNICODE) {
6066            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6067                goto onError;
6068        }
6069        else {
6070            endinpos = s-starts;
6071            if (unicode_decode_call_errorhandler_writer(
6072                    errors, &errorHandler,
6073                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6074                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6075                    &writer))
6076                goto onError;
6077        }
6078      nextByte:
6079        ;
6080    }
6081    Py_XDECREF(errorHandler);
6082    Py_XDECREF(exc);
6083    return _PyUnicodeWriter_Finish(&writer);
6084
6085  onError:
6086    _PyUnicodeWriter_Dealloc(&writer);
6087    Py_XDECREF(errorHandler);
6088    Py_XDECREF(exc);
6089    return NULL;
6090}
6091
6092
6093PyObject *
6094PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6095{
6096    PyObject *repr;
6097    char *p;
6098    char *q;
6099    Py_ssize_t expandsize, pos;
6100    int kind;
6101    void *data;
6102    Py_ssize_t len;
6103
6104    if (!PyUnicode_Check(unicode)) {
6105        PyErr_BadArgument();
6106        return NULL;
6107    }
6108    if (PyUnicode_READY(unicode) == -1)
6109        return NULL;
6110    kind = PyUnicode_KIND(unicode);
6111    data = PyUnicode_DATA(unicode);
6112    len = PyUnicode_GET_LENGTH(unicode);
6113    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6114       bytes, and 1 byte characters 4. */
6115    expandsize = kind * 2 + 2;
6116
6117    if (len > PY_SSIZE_T_MAX / expandsize)
6118        return PyErr_NoMemory();
6119
6120    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6121    if (repr == NULL)
6122        return NULL;
6123    if (len == 0)
6124        return repr;
6125
6126    p = q = PyBytes_AS_STRING(repr);
6127    for (pos = 0; pos < len; pos++) {
6128        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6129        /* Map 32-bit characters to '\Uxxxxxxxx' */
6130        if (ch >= 0x10000) {
6131            assert(ch <= MAX_UNICODE);
6132            *p++ = '\\';
6133            *p++ = 'U';
6134            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6135            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6136            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6137            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6138            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6139            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6140            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6141            *p++ = Py_hexdigits[ch & 15];
6142        }
6143        /* Map 16-bit characters to '\uxxxx' */
6144        else if (ch >= 256) {
6145            *p++ = '\\';
6146            *p++ = 'u';
6147            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6148            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6149            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6150            *p++ = Py_hexdigits[ch & 15];
6151        }
6152        /* Copy everything else as-is */
6153        else
6154            *p++ = (char) ch;
6155    }
6156
6157    assert(p > q);
6158    if (_PyBytes_Resize(&repr, p - q) < 0)
6159        return NULL;
6160    return repr;
6161}
6162
6163PyObject *
6164PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6165                                 Py_ssize_t size)
6166{
6167    PyObject *result;
6168    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6169    if (tmp == NULL)
6170        return NULL;
6171    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6172    Py_DECREF(tmp);
6173    return result;
6174}
6175
6176/* --- Unicode Internal Codec ------------------------------------------- */
6177
6178PyObject *
6179_PyUnicode_DecodeUnicodeInternal(const char *s,
6180                                 Py_ssize_t size,
6181                                 const char *errors)
6182{
6183    const char *starts = s;
6184    Py_ssize_t startinpos;
6185    Py_ssize_t endinpos;
6186    _PyUnicodeWriter writer;
6187    const char *end;
6188    const char *reason;
6189    PyObject *errorHandler = NULL;
6190    PyObject *exc = NULL;
6191
6192    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6193                     "unicode_internal codec has been deprecated",
6194                     1))
6195        return NULL;
6196
6197    if (size == 0)
6198        _Py_RETURN_UNICODE_EMPTY();
6199
6200    _PyUnicodeWriter_Init(&writer);
6201    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6202        PyErr_NoMemory();
6203        goto onError;
6204    }
6205    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6206
6207    end = s + size;
6208    while (s < end) {
6209        Py_UNICODE uch;
6210        Py_UCS4 ch;
6211        if (end - s < Py_UNICODE_SIZE) {
6212            endinpos = end-starts;
6213            reason = "truncated input";
6214            goto error;
6215        }
6216        /* We copy the raw representation one byte at a time because the
6217           pointer may be unaligned (see test_codeccallbacks). */
6218        ((char *) &uch)[0] = s[0];
6219        ((char *) &uch)[1] = s[1];
6220#ifdef Py_UNICODE_WIDE
6221        ((char *) &uch)[2] = s[2];
6222        ((char *) &uch)[3] = s[3];
6223#endif
6224        ch = uch;
6225#ifdef Py_UNICODE_WIDE
6226        /* We have to sanity check the raw data, otherwise doom looms for
6227           some malformed UCS-4 data. */
6228        if (ch > 0x10ffff) {
6229            endinpos = s - starts + Py_UNICODE_SIZE;
6230            reason = "illegal code point (> 0x10FFFF)";
6231            goto error;
6232        }
6233#endif
6234        s += Py_UNICODE_SIZE;
6235#ifndef Py_UNICODE_WIDE
6236        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6237        {
6238            Py_UNICODE uch2;
6239            ((char *) &uch2)[0] = s[0];
6240            ((char *) &uch2)[1] = s[1];
6241            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6242            {
6243                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6244                s += Py_UNICODE_SIZE;
6245            }
6246        }
6247#endif
6248
6249        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6250            goto onError;
6251        continue;
6252
6253  error:
6254        startinpos = s - starts;
6255        if (unicode_decode_call_errorhandler_writer(
6256                errors, &errorHandler,
6257                "unicode_internal", reason,
6258                &starts, &end, &startinpos, &endinpos, &exc, &s,
6259                &writer))
6260            goto onError;
6261    }
6262
6263    Py_XDECREF(errorHandler);
6264    Py_XDECREF(exc);
6265    return _PyUnicodeWriter_Finish(&writer);
6266
6267  onError:
6268    _PyUnicodeWriter_Dealloc(&writer);
6269    Py_XDECREF(errorHandler);
6270    Py_XDECREF(exc);
6271    return NULL;
6272}
6273
6274/* --- Latin-1 Codec ------------------------------------------------------ */
6275
6276PyObject *
6277PyUnicode_DecodeLatin1(const char *s,
6278                       Py_ssize_t size,
6279                       const char *errors)
6280{
6281    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6282    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6283}
6284
6285/* create or adjust a UnicodeEncodeError */
6286static void
6287make_encode_exception(PyObject **exceptionObject,
6288                      const char *encoding,
6289                      PyObject *unicode,
6290                      Py_ssize_t startpos, Py_ssize_t endpos,
6291                      const char *reason)
6292{
6293    if (*exceptionObject == NULL) {
6294        *exceptionObject = PyObject_CallFunction(
6295            PyExc_UnicodeEncodeError, "sOnns",
6296            encoding, unicode, startpos, endpos, reason);
6297    }
6298    else {
6299        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6300            goto onError;
6301        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6302            goto onError;
6303        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6304            goto onError;
6305        return;
6306      onError:
6307        Py_CLEAR(*exceptionObject);
6308    }
6309}
6310
6311/* raises a UnicodeEncodeError */
6312static void
6313raise_encode_exception(PyObject **exceptionObject,
6314                       const char *encoding,
6315                       PyObject *unicode,
6316                       Py_ssize_t startpos, Py_ssize_t endpos,
6317                       const char *reason)
6318{
6319    make_encode_exception(exceptionObject,
6320                          encoding, unicode, startpos, endpos, reason);
6321    if (*exceptionObject != NULL)
6322        PyCodec_StrictErrors(*exceptionObject);
6323}
6324
6325/* error handling callback helper:
6326   build arguments, call the callback and check the arguments,
6327   put the result into newpos and return the replacement string, which
6328   has to be freed by the caller */
6329static PyObject *
6330unicode_encode_call_errorhandler(const char *errors,
6331                                 PyObject **errorHandler,
6332                                 const char *encoding, const char *reason,
6333                                 PyObject *unicode, PyObject **exceptionObject,
6334                                 Py_ssize_t startpos, Py_ssize_t endpos,
6335                                 Py_ssize_t *newpos)
6336{
6337    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6338    Py_ssize_t len;
6339    PyObject *restuple;
6340    PyObject *resunicode;
6341
6342    if (*errorHandler == NULL) {
6343        *errorHandler = PyCodec_LookupError(errors);
6344        if (*errorHandler == NULL)
6345            return NULL;
6346    }
6347
6348    if (PyUnicode_READY(unicode) == -1)
6349        return NULL;
6350    len = PyUnicode_GET_LENGTH(unicode);
6351
6352    make_encode_exception(exceptionObject,
6353                          encoding, unicode, startpos, endpos, reason);
6354    if (*exceptionObject == NULL)
6355        return NULL;
6356
6357    restuple = PyObject_CallFunctionObjArgs(
6358        *errorHandler, *exceptionObject, NULL);
6359    if (restuple == NULL)
6360        return NULL;
6361    if (!PyTuple_Check(restuple)) {
6362        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6363        Py_DECREF(restuple);
6364        return NULL;
6365    }
6366    if (!PyArg_ParseTuple(restuple, argparse,
6367                          &resunicode, newpos)) {
6368        Py_DECREF(restuple);
6369        return NULL;
6370    }
6371    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6372        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6373        Py_DECREF(restuple);
6374        return NULL;
6375    }
6376    if (*newpos<0)
6377        *newpos = len + *newpos;
6378    if (*newpos<0 || *newpos>len) {
6379        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6380        Py_DECREF(restuple);
6381        return NULL;
6382    }
6383    Py_INCREF(resunicode);
6384    Py_DECREF(restuple);
6385    return resunicode;
6386}
6387
6388static PyObject *
6389unicode_encode_ucs1(PyObject *unicode,
6390                    const char *errors,
6391                    unsigned int limit)
6392{
6393    /* input state */
6394    Py_ssize_t pos=0, size;
6395    int kind;
6396    void *data;
6397    /* output object */
6398    PyObject *res;
6399    /* pointer into the output */
6400    char *str;
6401    /* current output position */
6402    Py_ssize_t ressize;
6403    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6404    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6405    PyObject *errorHandler = NULL;
6406    PyObject *exc = NULL;
6407    /* the following variable is used for caching string comparisons
6408     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6409    int known_errorHandler = -1;
6410
6411    if (PyUnicode_READY(unicode) == -1)
6412        return NULL;
6413    size = PyUnicode_GET_LENGTH(unicode);
6414    kind = PyUnicode_KIND(unicode);
6415    data = PyUnicode_DATA(unicode);
6416    /* allocate enough for a simple encoding without
6417       replacements, if we need more, we'll resize */
6418    if (size == 0)
6419        return PyBytes_FromStringAndSize(NULL, 0);
6420    res = PyBytes_FromStringAndSize(NULL, size);
6421    if (res == NULL)
6422        return NULL;
6423    str = PyBytes_AS_STRING(res);
6424    ressize = size;
6425
6426    while (pos < size) {
6427        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6428
6429        /* can we encode this? */
6430        if (c<limit) {
6431            /* no overflow check, because we know that the space is enough */
6432            *str++ = (char)c;
6433            ++pos;
6434        }
6435        else {
6436            Py_ssize_t requiredsize;
6437            PyObject *repunicode;
6438            Py_ssize_t repsize, newpos, respos, i;
6439            /* startpos for collecting unencodable chars */
6440            Py_ssize_t collstart = pos;
6441            Py_ssize_t collend = pos;
6442            /* find all unecodable characters */
6443            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6444                ++collend;
6445            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6446            if (known_errorHandler==-1) {
6447                if ((errors==NULL) || (!strcmp(errors, "strict")))
6448                    known_errorHandler = 1;
6449                else if (!strcmp(errors, "replace"))
6450                    known_errorHandler = 2;
6451                else if (!strcmp(errors, "ignore"))
6452                    known_errorHandler = 3;
6453                else if (!strcmp(errors, "xmlcharrefreplace"))
6454                    known_errorHandler = 4;
6455                else
6456                    known_errorHandler = 0;
6457            }
6458            switch (known_errorHandler) {
6459            case 1: /* strict */
6460                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6461                goto onError;
6462            case 2: /* replace */
6463                while (collstart++ < collend)
6464                    *str++ = '?'; /* fall through */
6465            case 3: /* ignore */
6466                pos = collend;
6467                break;
6468            case 4: /* xmlcharrefreplace */
6469                respos = str - PyBytes_AS_STRING(res);
6470                requiredsize = respos;
6471                /* determine replacement size */
6472                for (i = collstart; i < collend; ++i) {
6473                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6474                    Py_ssize_t incr;
6475                    if (ch < 10)
6476                        incr = 2+1+1;
6477                    else if (ch < 100)
6478                        incr = 2+2+1;
6479                    else if (ch < 1000)
6480                        incr = 2+3+1;
6481                    else if (ch < 10000)
6482                        incr = 2+4+1;
6483                    else if (ch < 100000)
6484                        incr = 2+5+1;
6485                    else if (ch < 1000000)
6486                        incr = 2+6+1;
6487                    else {
6488                        assert(ch <= MAX_UNICODE);
6489                        incr = 2+7+1;
6490                    }
6491                    if (requiredsize > PY_SSIZE_T_MAX - incr)
6492                        goto overflow;
6493                    requiredsize += incr;
6494                }
6495                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6496                    goto overflow;
6497                requiredsize += size - collend;
6498                if (requiredsize > ressize) {
6499                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6500                        requiredsize = 2*ressize;
6501                    if (_PyBytes_Resize(&res, requiredsize))
6502                        goto onError;
6503                    str = PyBytes_AS_STRING(res) + respos;
6504                    ressize = requiredsize;
6505                }
6506                /* generate replacement */
6507                for (i = collstart; i < collend; ++i) {
6508                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6509                }
6510                pos = collend;
6511                break;
6512            default:
6513                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6514                                                              encoding, reason, unicode, &exc,
6515                                                              collstart, collend, &newpos);
6516                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6517                                           PyUnicode_READY(repunicode) == -1))
6518                    goto onError;
6519                if (PyBytes_Check(repunicode)) {
6520                    /* Directly copy bytes result to output. */
6521                    repsize = PyBytes_Size(repunicode);
6522                    if (repsize > 1) {
6523                        /* Make room for all additional bytes. */
6524                        respos = str - PyBytes_AS_STRING(res);
6525                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6526                            Py_DECREF(repunicode);
6527                            goto overflow;
6528                        }
6529                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6530                            Py_DECREF(repunicode);
6531                            goto onError;
6532                        }
6533                        str = PyBytes_AS_STRING(res) + respos;
6534                        ressize += repsize-1;
6535                    }
6536                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6537                    str += repsize;
6538                    pos = newpos;
6539                    Py_DECREF(repunicode);
6540                    break;
6541                }
6542                /* need more space? (at least enough for what we
6543                   have+the replacement+the rest of the string, so
6544                   we won't have to check space for encodable characters) */
6545                respos = str - PyBytes_AS_STRING(res);
6546                repsize = PyUnicode_GET_LENGTH(repunicode);
6547                requiredsize = respos;
6548                if (requiredsize > PY_SSIZE_T_MAX - repsize)
6549                    goto overflow;
6550                requiredsize += repsize;
6551                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6552                    goto overflow;
6553                requiredsize += size - collend;
6554                if (requiredsize > ressize) {
6555                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6556                        requiredsize = 2*ressize;
6557                    if (_PyBytes_Resize(&res, requiredsize)) {
6558                        Py_DECREF(repunicode);
6559                        goto onError;
6560                    }
6561                    str = PyBytes_AS_STRING(res) + respos;
6562                    ressize = requiredsize;
6563                }
6564                /* check if there is anything unencodable in the replacement
6565                   and copy it to the output */
6566                for (i = 0; repsize-->0; ++i, ++str) {
6567                    c = PyUnicode_READ_CHAR(repunicode, i);
6568                    if (c >= limit) {
6569                        raise_encode_exception(&exc, encoding, unicode,
6570                                               pos, pos+1, reason);
6571                        Py_DECREF(repunicode);
6572                        goto onError;
6573                    }
6574                    *str = (char)c;
6575                }
6576                pos = newpos;
6577                Py_DECREF(repunicode);
6578            }
6579        }
6580    }
6581    /* Resize if we allocated to much */
6582    size = str - PyBytes_AS_STRING(res);
6583    if (size < ressize) { /* If this falls res will be NULL */
6584        assert(size >= 0);
6585        if (_PyBytes_Resize(&res, size) < 0)
6586            goto onError;
6587    }
6588
6589    Py_XDECREF(errorHandler);
6590    Py_XDECREF(exc);
6591    return res;
6592
6593  overflow:
6594    PyErr_SetString(PyExc_OverflowError,
6595                    "encoded result is too long for a Python string");
6596
6597  onError:
6598    Py_XDECREF(res);
6599    Py_XDECREF(errorHandler);
6600    Py_XDECREF(exc);
6601    return NULL;
6602}
6603
6604/* Deprecated */
6605PyObject *
6606PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6607                       Py_ssize_t size,
6608                       const char *errors)
6609{
6610    PyObject *result;
6611    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6612    if (unicode == NULL)
6613        return NULL;
6614    result = unicode_encode_ucs1(unicode, errors, 256);
6615    Py_DECREF(unicode);
6616    return result;
6617}
6618
6619PyObject *
6620_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6621{
6622    if (!PyUnicode_Check(unicode)) {
6623        PyErr_BadArgument();
6624        return NULL;
6625    }
6626    if (PyUnicode_READY(unicode) == -1)
6627        return NULL;
6628    /* Fast path: if it is a one-byte string, construct
6629       bytes object directly. */
6630    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6631        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6632                                         PyUnicode_GET_LENGTH(unicode));
6633    /* Non-Latin-1 characters present. Defer to above function to
6634       raise the exception. */
6635    return unicode_encode_ucs1(unicode, errors, 256);
6636}
6637
6638PyObject*
6639PyUnicode_AsLatin1String(PyObject *unicode)
6640{
6641    return _PyUnicode_AsLatin1String(unicode, NULL);
6642}
6643
6644/* --- 7-bit ASCII Codec -------------------------------------------------- */
6645
6646PyObject *
6647PyUnicode_DecodeASCII(const char *s,
6648                      Py_ssize_t size,
6649                      const char *errors)
6650{
6651    const char *starts = s;
6652    _PyUnicodeWriter writer;
6653    int kind;
6654    void *data;
6655    Py_ssize_t startinpos;
6656    Py_ssize_t endinpos;
6657    Py_ssize_t outpos;
6658    const char *e;
6659    PyObject *errorHandler = NULL;
6660    PyObject *exc = NULL;
6661
6662    if (size == 0)
6663        _Py_RETURN_UNICODE_EMPTY();
6664
6665    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6666    if (size == 1 && (unsigned char)s[0] < 128)
6667        return get_latin1_char((unsigned char)s[0]);
6668
6669    _PyUnicodeWriter_Init(&writer);
6670    writer.min_length = size;
6671    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6672        return NULL;
6673
6674    e = s + size;
6675    data = writer.data;
6676    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6677    writer.pos = outpos;
6678    if (writer.pos == size)
6679        return _PyUnicodeWriter_Finish(&writer);
6680
6681    s += writer.pos;
6682    kind = writer.kind;
6683    while (s < e) {
6684        unsigned char c = (unsigned char)*s;
6685        if (c < 128) {
6686            PyUnicode_WRITE(kind, data, writer.pos, c);
6687            writer.pos++;
6688            ++s;
6689        }
6690        else {
6691            startinpos = s-starts;
6692            endinpos = startinpos + 1;
6693            if (unicode_decode_call_errorhandler_writer(
6694                    errors, &errorHandler,
6695                    "ascii", "ordinal not in range(128)",
6696                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6697                    &writer))
6698                goto onError;
6699            kind = writer.kind;
6700            data = writer.data;
6701        }
6702    }
6703    Py_XDECREF(errorHandler);
6704    Py_XDECREF(exc);
6705    return _PyUnicodeWriter_Finish(&writer);
6706
6707  onError:
6708    _PyUnicodeWriter_Dealloc(&writer);
6709    Py_XDECREF(errorHandler);
6710    Py_XDECREF(exc);
6711    return NULL;
6712}
6713
6714/* Deprecated */
6715PyObject *
6716PyUnicode_EncodeASCII(const Py_UNICODE *p,
6717                      Py_ssize_t size,
6718                      const char *errors)
6719{
6720    PyObject *result;
6721    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6722    if (unicode == NULL)
6723        return NULL;
6724    result = unicode_encode_ucs1(unicode, errors, 128);
6725    Py_DECREF(unicode);
6726    return result;
6727}
6728
6729PyObject *
6730_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6731{
6732    if (!PyUnicode_Check(unicode)) {
6733        PyErr_BadArgument();
6734        return NULL;
6735    }
6736    if (PyUnicode_READY(unicode) == -1)
6737        return NULL;
6738    /* Fast path: if it is an ASCII-only string, construct bytes object
6739       directly. Else defer to above function to raise the exception. */
6740    if (PyUnicode_IS_ASCII(unicode))
6741        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6742                                         PyUnicode_GET_LENGTH(unicode));
6743    return unicode_encode_ucs1(unicode, errors, 128);
6744}
6745
6746PyObject *
6747PyUnicode_AsASCIIString(PyObject *unicode)
6748{
6749    return _PyUnicode_AsASCIIString(unicode, NULL);
6750}
6751
6752#ifdef HAVE_MBCS
6753
6754/* --- MBCS codecs for Windows -------------------------------------------- */
6755
6756#if SIZEOF_INT < SIZEOF_SIZE_T
6757#define NEED_RETRY
6758#endif
6759
6760#ifndef WC_ERR_INVALID_CHARS
6761#  define WC_ERR_INVALID_CHARS 0x0080
6762#endif
6763
6764static char*
6765code_page_name(UINT code_page, PyObject **obj)
6766{
6767    *obj = NULL;
6768    if (code_page == CP_ACP)
6769        return "mbcs";
6770    if (code_page == CP_UTF7)
6771        return "CP_UTF7";
6772    if (code_page == CP_UTF8)
6773        return "CP_UTF8";
6774
6775    *obj = PyBytes_FromFormat("cp%u", code_page);
6776    if (*obj == NULL)
6777        return NULL;
6778    return PyBytes_AS_STRING(*obj);
6779}
6780
6781static DWORD
6782decode_code_page_flags(UINT code_page)
6783{
6784    if (code_page == CP_UTF7) {
6785        /* The CP_UTF7 decoder only supports flags=0 */
6786        return 0;
6787    }
6788    else
6789        return MB_ERR_INVALID_CHARS;
6790}
6791
6792/*
6793 * Decode a byte string from a Windows code page into unicode object in strict
6794 * mode.
6795 *
6796 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6797 * OSError and returns -1 on other error.
6798 */
6799static int
6800decode_code_page_strict(UINT code_page,
6801                        PyObject **v,
6802                        const char *in,
6803                        int insize)
6804{
6805    const DWORD flags = decode_code_page_flags(code_page);
6806    wchar_t *out;
6807    DWORD outsize;
6808
6809    /* First get the size of the result */
6810    assert(insize > 0);
6811    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6812    if (outsize <= 0)
6813        goto error;
6814
6815    if (*v == NULL) {
6816        /* Create unicode object */
6817        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6818        *v = (PyObject*)_PyUnicode_New(outsize);
6819        if (*v == NULL)
6820            return -1;
6821        out = PyUnicode_AS_UNICODE(*v);
6822    }
6823    else {
6824        /* Extend unicode object */
6825        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6826        if (unicode_resize(v, n + outsize) < 0)
6827            return -1;
6828        out = PyUnicode_AS_UNICODE(*v) + n;
6829    }
6830
6831    /* Do the conversion */
6832    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6833    if (outsize <= 0)
6834        goto error;
6835    return insize;
6836
6837error:
6838    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6839        return -2;
6840    PyErr_SetFromWindowsErr(0);
6841    return -1;
6842}
6843
6844/*
6845 * Decode a byte string from a code page into unicode object with an error
6846 * handler.
6847 *
6848 * Returns consumed size if succeed, or raise an OSError or
6849 * UnicodeDecodeError exception and returns -1 on error.
6850 */
6851static int
6852decode_code_page_errors(UINT code_page,
6853                        PyObject **v,
6854                        const char *in, const int size,
6855                        const char *errors, int final)
6856{
6857    const char *startin = in;
6858    const char *endin = in + size;
6859    const DWORD flags = decode_code_page_flags(code_page);
6860    /* Ideally, we should get reason from FormatMessage. This is the Windows
6861       2000 English version of the message. */
6862    const char *reason = "No mapping for the Unicode character exists "
6863                         "in the target code page.";
6864    /* each step cannot decode more than 1 character, but a character can be
6865       represented as a surrogate pair */
6866    wchar_t buffer[2], *startout, *out;
6867    int insize;
6868    Py_ssize_t outsize;
6869    PyObject *errorHandler = NULL;
6870    PyObject *exc = NULL;
6871    PyObject *encoding_obj = NULL;
6872    char *encoding;
6873    DWORD err;
6874    int ret = -1;
6875
6876    assert(size > 0);
6877
6878    encoding = code_page_name(code_page, &encoding_obj);
6879    if (encoding == NULL)
6880        return -1;
6881
6882    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6883        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6884           UnicodeDecodeError. */
6885        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6886        if (exc != NULL) {
6887            PyCodec_StrictErrors(exc);
6888            Py_CLEAR(exc);
6889        }
6890        goto error;
6891    }
6892
6893    if (*v == NULL) {
6894        /* Create unicode object */
6895        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6896            PyErr_NoMemory();
6897            goto error;
6898        }
6899        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6900        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6901        if (*v == NULL)
6902            goto error;
6903        startout = PyUnicode_AS_UNICODE(*v);
6904    }
6905    else {
6906        /* Extend unicode object */
6907        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6908        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6909            PyErr_NoMemory();
6910            goto error;
6911        }
6912        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6913            goto error;
6914        startout = PyUnicode_AS_UNICODE(*v) + n;
6915    }
6916
6917    /* Decode the byte string character per character */
6918    out = startout;
6919    while (in < endin)
6920    {
6921        /* Decode a character */
6922        insize = 1;
6923        do
6924        {
6925            outsize = MultiByteToWideChar(code_page, flags,
6926                                          in, insize,
6927                                          buffer, Py_ARRAY_LENGTH(buffer));
6928            if (outsize > 0)
6929                break;
6930            err = GetLastError();
6931            if (err != ERROR_NO_UNICODE_TRANSLATION
6932                && err != ERROR_INSUFFICIENT_BUFFER)
6933            {
6934                PyErr_SetFromWindowsErr(0);
6935                goto error;
6936            }
6937            insize++;
6938        }
6939        /* 4=maximum length of a UTF-8 sequence */
6940        while (insize <= 4 && (in + insize) <= endin);
6941
6942        if (outsize <= 0) {
6943            Py_ssize_t startinpos, endinpos, outpos;
6944
6945            /* last character in partial decode? */
6946            if (in + insize >= endin && !final)
6947                break;
6948
6949            startinpos = in - startin;
6950            endinpos = startinpos + 1;
6951            outpos = out - PyUnicode_AS_UNICODE(*v);
6952            if (unicode_decode_call_errorhandler_wchar(
6953                    errors, &errorHandler,
6954                    encoding, reason,
6955                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6956                    v, &outpos))
6957            {
6958                goto error;
6959            }
6960            out = PyUnicode_AS_UNICODE(*v) + outpos;
6961        }
6962        else {
6963            in += insize;
6964            memcpy(out, buffer, outsize * sizeof(wchar_t));
6965            out += outsize;
6966        }
6967    }
6968
6969    /* write a NUL character at the end */
6970    *out = 0;
6971
6972    /* Extend unicode object */
6973    outsize = out - startout;
6974    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6975    if (unicode_resize(v, outsize) < 0)
6976        goto error;
6977    /* (in - startin) <= size and size is an int */
6978    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
6979
6980error:
6981    Py_XDECREF(encoding_obj);
6982    Py_XDECREF(errorHandler);
6983    Py_XDECREF(exc);
6984    return ret;
6985}
6986
6987static PyObject *
6988decode_code_page_stateful(int code_page,
6989                          const char *s, Py_ssize_t size,
6990                          const char *errors, Py_ssize_t *consumed)
6991{
6992    PyObject *v = NULL;
6993    int chunk_size, final, converted, done;
6994
6995    if (code_page < 0) {
6996        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6997        return NULL;
6998    }
6999
7000    if (consumed)
7001        *consumed = 0;
7002
7003    do
7004    {
7005#ifdef NEED_RETRY
7006        if (size > INT_MAX) {
7007            chunk_size = INT_MAX;
7008            final = 0;
7009            done = 0;
7010        }
7011        else
7012#endif
7013        {
7014            chunk_size = (int)size;
7015            final = (consumed == NULL);
7016            done = 1;
7017        }
7018
7019        if (chunk_size == 0 && done) {
7020            if (v != NULL)
7021                break;
7022            _Py_RETURN_UNICODE_EMPTY();
7023        }
7024
7025        converted = decode_code_page_strict(code_page, &v,
7026                                            s, chunk_size);
7027        if (converted == -2)
7028            converted = decode_code_page_errors(code_page, &v,
7029                                                s, chunk_size,
7030                                                errors, final);
7031        assert(converted != 0 || done);
7032
7033        if (converted < 0) {
7034            Py_XDECREF(v);
7035            return NULL;
7036        }
7037
7038        if (consumed)
7039            *consumed += converted;
7040
7041        s += converted;
7042        size -= converted;
7043    } while (!done);
7044
7045    return unicode_result(v);
7046}
7047
7048PyObject *
7049PyUnicode_DecodeCodePageStateful(int code_page,
7050                                 const char *s,
7051                                 Py_ssize_t size,
7052                                 const char *errors,
7053                                 Py_ssize_t *consumed)
7054{
7055    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7056}
7057
7058PyObject *
7059PyUnicode_DecodeMBCSStateful(const char *s,
7060                             Py_ssize_t size,
7061                             const char *errors,
7062                             Py_ssize_t *consumed)
7063{
7064    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7065}
7066
7067PyObject *
7068PyUnicode_DecodeMBCS(const char *s,
7069                     Py_ssize_t size,
7070                     const char *errors)
7071{
7072    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7073}
7074
7075static DWORD
7076encode_code_page_flags(UINT code_page, const char *errors)
7077{
7078    if (code_page == CP_UTF8) {
7079        return WC_ERR_INVALID_CHARS;
7080    }
7081    else if (code_page == CP_UTF7) {
7082        /* CP_UTF7 only supports flags=0 */
7083        return 0;
7084    }
7085    else {
7086        if (errors != NULL && strcmp(errors, "replace") == 0)
7087            return 0;
7088        else
7089            return WC_NO_BEST_FIT_CHARS;
7090    }
7091}
7092
7093/*
7094 * Encode a Unicode string to a Windows code page into a byte string in strict
7095 * mode.
7096 *
7097 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7098 * an OSError and returns -1 on other error.
7099 */
7100static int
7101encode_code_page_strict(UINT code_page, PyObject **outbytes,
7102                        PyObject *unicode, Py_ssize_t offset, int len,
7103                        const char* errors)
7104{
7105    BOOL usedDefaultChar = FALSE;
7106    BOOL *pusedDefaultChar = &usedDefaultChar;
7107    int outsize;
7108    PyObject *exc = NULL;
7109    wchar_t *p;
7110    Py_ssize_t size;
7111    const DWORD flags = encode_code_page_flags(code_page, NULL);
7112    char *out;
7113    /* Create a substring so that we can get the UTF-16 representation
7114       of just the slice under consideration. */
7115    PyObject *substring;
7116
7117    assert(len > 0);
7118
7119    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7120        pusedDefaultChar = &usedDefaultChar;
7121    else
7122        pusedDefaultChar = NULL;
7123
7124    substring = PyUnicode_Substring(unicode, offset, offset+len);
7125    if (substring == NULL)
7126        return -1;
7127    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7128    if (p == NULL) {
7129        Py_DECREF(substring);
7130        return -1;
7131    }
7132    assert(size <= INT_MAX);
7133
7134    /* First get the size of the result */
7135    outsize = WideCharToMultiByte(code_page, flags,
7136                                  p, (int)size,
7137                                  NULL, 0,
7138                                  NULL, pusedDefaultChar);
7139    if (outsize <= 0)
7140        goto error;
7141    /* If we used a default char, then we failed! */
7142    if (pusedDefaultChar && *pusedDefaultChar) {
7143        Py_DECREF(substring);
7144        return -2;
7145    }
7146
7147    if (*outbytes == NULL) {
7148        /* Create string object */
7149        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7150        if (*outbytes == NULL) {
7151            Py_DECREF(substring);
7152            return -1;
7153        }
7154        out = PyBytes_AS_STRING(*outbytes);
7155    }
7156    else {
7157        /* Extend string object */
7158        const Py_ssize_t n = PyBytes_Size(*outbytes);
7159        if (outsize > PY_SSIZE_T_MAX - n) {
7160            PyErr_NoMemory();
7161            Py_DECREF(substring);
7162            return -1;
7163        }
7164        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7165            Py_DECREF(substring);
7166            return -1;
7167        }
7168        out = PyBytes_AS_STRING(*outbytes) + n;
7169    }
7170
7171    /* Do the conversion */
7172    outsize = WideCharToMultiByte(code_page, flags,
7173                                  p, (int)size,
7174                                  out, outsize,
7175                                  NULL, pusedDefaultChar);
7176    Py_CLEAR(substring);
7177    if (outsize <= 0)
7178        goto error;
7179    if (pusedDefaultChar && *pusedDefaultChar)
7180        return -2;
7181    return 0;
7182
7183error:
7184    Py_XDECREF(substring);
7185    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7186        return -2;
7187    PyErr_SetFromWindowsErr(0);
7188    return -1;
7189}
7190
7191/*
7192 * Encode a Unicode string to a Windows code page into a byte string using a
7193 * error handler.
7194 *
7195 * Returns consumed characters if succeed, or raise an OSError and returns
7196 * -1 on other error.
7197 */
7198static int
7199encode_code_page_errors(UINT code_page, PyObject **outbytes,
7200                        PyObject *unicode, Py_ssize_t unicode_offset,
7201                        Py_ssize_t insize, const char* errors)
7202{
7203    const DWORD flags = encode_code_page_flags(code_page, errors);
7204    Py_ssize_t pos = unicode_offset;
7205    Py_ssize_t endin = unicode_offset + insize;
7206    /* Ideally, we should get reason from FormatMessage. This is the Windows
7207       2000 English version of the message. */
7208    const char *reason = "invalid character";
7209    /* 4=maximum length of a UTF-8 sequence */
7210    char buffer[4];
7211    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7212    Py_ssize_t outsize;
7213    char *out;
7214    PyObject *errorHandler = NULL;
7215    PyObject *exc = NULL;
7216    PyObject *encoding_obj = NULL;
7217    char *encoding;
7218    Py_ssize_t newpos, newoutsize;
7219    PyObject *rep;
7220    int ret = -1;
7221
7222    assert(insize > 0);
7223
7224    encoding = code_page_name(code_page, &encoding_obj);
7225    if (encoding == NULL)
7226        return -1;
7227
7228    if (errors == NULL || strcmp(errors, "strict") == 0) {
7229        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7230           then we raise a UnicodeEncodeError. */
7231        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7232        if (exc != NULL) {
7233            PyCodec_StrictErrors(exc);
7234            Py_DECREF(exc);
7235        }
7236        Py_XDECREF(encoding_obj);
7237        return -1;
7238    }
7239
7240    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7241        pusedDefaultChar = &usedDefaultChar;
7242    else
7243        pusedDefaultChar = NULL;
7244
7245    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7246        PyErr_NoMemory();
7247        goto error;
7248    }
7249    outsize = insize * Py_ARRAY_LENGTH(buffer);
7250
7251    if (*outbytes == NULL) {
7252        /* Create string object */
7253        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7254        if (*outbytes == NULL)
7255            goto error;
7256        out = PyBytes_AS_STRING(*outbytes);
7257    }
7258    else {
7259        /* Extend string object */
7260        Py_ssize_t n = PyBytes_Size(*outbytes);
7261        if (n > PY_SSIZE_T_MAX - outsize) {
7262            PyErr_NoMemory();
7263            goto error;
7264        }
7265        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7266            goto error;
7267        out = PyBytes_AS_STRING(*outbytes) + n;
7268    }
7269
7270    /* Encode the string character per character */
7271    while (pos < endin)
7272    {
7273        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7274        wchar_t chars[2];
7275        int charsize;
7276        if (ch < 0x10000) {
7277            chars[0] = (wchar_t)ch;
7278            charsize = 1;
7279        }
7280        else {
7281            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7282            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7283            charsize = 2;
7284        }
7285
7286        outsize = WideCharToMultiByte(code_page, flags,
7287                                      chars, charsize,
7288                                      buffer, Py_ARRAY_LENGTH(buffer),
7289                                      NULL, pusedDefaultChar);
7290        if (outsize > 0) {
7291            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7292            {
7293                pos++;
7294                memcpy(out, buffer, outsize);
7295                out += outsize;
7296                continue;
7297            }
7298        }
7299        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7300            PyErr_SetFromWindowsErr(0);
7301            goto error;
7302        }
7303
7304        rep = unicode_encode_call_errorhandler(
7305                  errors, &errorHandler, encoding, reason,
7306                  unicode, &exc,
7307                  pos, pos + 1, &newpos);
7308        if (rep == NULL)
7309            goto error;
7310        pos = newpos;
7311
7312        if (PyBytes_Check(rep)) {
7313            outsize = PyBytes_GET_SIZE(rep);
7314            if (outsize != 1) {
7315                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7316                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7317                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7318                    Py_DECREF(rep);
7319                    goto error;
7320                }
7321                out = PyBytes_AS_STRING(*outbytes) + offset;
7322            }
7323            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7324            out += outsize;
7325        }
7326        else {
7327            Py_ssize_t i;
7328            enum PyUnicode_Kind kind;
7329            void *data;
7330
7331            if (PyUnicode_READY(rep) == -1) {
7332                Py_DECREF(rep);
7333                goto error;
7334            }
7335
7336            outsize = PyUnicode_GET_LENGTH(rep);
7337            if (outsize != 1) {
7338                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7339                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7340                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7341                    Py_DECREF(rep);
7342                    goto error;
7343                }
7344                out = PyBytes_AS_STRING(*outbytes) + offset;
7345            }
7346            kind = PyUnicode_KIND(rep);
7347            data = PyUnicode_DATA(rep);
7348            for (i=0; i < outsize; i++) {
7349                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7350                if (ch > 127) {
7351                    raise_encode_exception(&exc,
7352                        encoding, unicode,
7353                        pos, pos + 1,
7354                        "unable to encode error handler result to ASCII");
7355                    Py_DECREF(rep);
7356                    goto error;
7357                }
7358                *out = (unsigned char)ch;
7359                out++;
7360            }
7361        }
7362        Py_DECREF(rep);
7363    }
7364    /* write a NUL byte */
7365    *out = 0;
7366    outsize = out - PyBytes_AS_STRING(*outbytes);
7367    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7368    if (_PyBytes_Resize(outbytes, outsize) < 0)
7369        goto error;
7370    ret = 0;
7371
7372error:
7373    Py_XDECREF(encoding_obj);
7374    Py_XDECREF(errorHandler);
7375    Py_XDECREF(exc);
7376    return ret;
7377}
7378
7379static PyObject *
7380encode_code_page(int code_page,
7381                 PyObject *unicode,
7382                 const char *errors)
7383{
7384    Py_ssize_t len;
7385    PyObject *outbytes = NULL;
7386    Py_ssize_t offset;
7387    int chunk_len, ret, done;
7388
7389    if (!PyUnicode_Check(unicode)) {
7390        PyErr_BadArgument();
7391        return NULL;
7392    }
7393
7394    if (PyUnicode_READY(unicode) == -1)
7395        return NULL;
7396    len = PyUnicode_GET_LENGTH(unicode);
7397
7398    if (code_page < 0) {
7399        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7400        return NULL;
7401    }
7402
7403    if (len == 0)
7404        return PyBytes_FromStringAndSize(NULL, 0);
7405
7406    offset = 0;
7407    do
7408    {
7409#ifdef NEED_RETRY
7410        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7411           chunks. */
7412        if (len > INT_MAX/2) {
7413            chunk_len = INT_MAX/2;
7414            done = 0;
7415        }
7416        else
7417#endif
7418        {
7419            chunk_len = (int)len;
7420            done = 1;
7421        }
7422
7423        ret = encode_code_page_strict(code_page, &outbytes,
7424                                      unicode, offset, chunk_len,
7425                                      errors);
7426        if (ret == -2)
7427            ret = encode_code_page_errors(code_page, &outbytes,
7428                                          unicode, offset,
7429                                          chunk_len, errors);
7430        if (ret < 0) {
7431            Py_XDECREF(outbytes);
7432            return NULL;
7433        }
7434
7435        offset += chunk_len;
7436        len -= chunk_len;
7437    } while (!done);
7438
7439    return outbytes;
7440}
7441
7442PyObject *
7443PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7444                     Py_ssize_t size,
7445                     const char *errors)
7446{
7447    PyObject *unicode, *res;
7448    unicode = PyUnicode_FromUnicode(p, size);
7449    if (unicode == NULL)
7450        return NULL;
7451    res = encode_code_page(CP_ACP, unicode, errors);
7452    Py_DECREF(unicode);
7453    return res;
7454}
7455
7456PyObject *
7457PyUnicode_EncodeCodePage(int code_page,
7458                         PyObject *unicode,
7459                         const char *errors)
7460{
7461    return encode_code_page(code_page, unicode, errors);
7462}
7463
7464PyObject *
7465PyUnicode_AsMBCSString(PyObject *unicode)
7466{
7467    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7468}
7469
7470#undef NEED_RETRY
7471
7472#endif /* HAVE_MBCS */
7473
7474/* --- Character Mapping Codec -------------------------------------------- */
7475
7476static int
7477charmap_decode_string(const char *s,
7478                      Py_ssize_t size,
7479                      PyObject *mapping,
7480                      const char *errors,
7481                      _PyUnicodeWriter *writer)
7482{
7483    const char *starts = s;
7484    const char *e;
7485    Py_ssize_t startinpos, endinpos;
7486    PyObject *errorHandler = NULL, *exc = NULL;
7487    Py_ssize_t maplen;
7488    enum PyUnicode_Kind mapkind;
7489    void *mapdata;
7490    Py_UCS4 x;
7491    unsigned char ch;
7492
7493    if (PyUnicode_READY(mapping) == -1)
7494        return -1;
7495
7496    maplen = PyUnicode_GET_LENGTH(mapping);
7497    mapdata = PyUnicode_DATA(mapping);
7498    mapkind = PyUnicode_KIND(mapping);
7499
7500    e = s + size;
7501
7502    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7503        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7504         * is disabled in encoding aliases, latin1 is preferred because
7505         * its implementation is faster. */
7506        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7507        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7508        Py_UCS4 maxchar = writer->maxchar;
7509
7510        assert (writer->kind == PyUnicode_1BYTE_KIND);
7511        while (s < e) {
7512            ch = *s;
7513            x = mapdata_ucs1[ch];
7514            if (x > maxchar) {
7515                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7516                    goto onError;
7517                maxchar = writer->maxchar;
7518                outdata = (Py_UCS1 *)writer->data;
7519            }
7520            outdata[writer->pos] = x;
7521            writer->pos++;
7522            ++s;
7523        }
7524        return 0;
7525    }
7526
7527    while (s < e) {
7528        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7529            enum PyUnicode_Kind outkind = writer->kind;
7530            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7531            if (outkind == PyUnicode_1BYTE_KIND) {
7532                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7533                Py_UCS4 maxchar = writer->maxchar;
7534                while (s < e) {
7535                    ch = *s;
7536                    x = mapdata_ucs2[ch];
7537                    if (x > maxchar)
7538                        goto Error;
7539                    outdata[writer->pos] = x;
7540                    writer->pos++;
7541                    ++s;
7542                }
7543                break;
7544            }
7545            else if (outkind == PyUnicode_2BYTE_KIND) {
7546                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7547                while (s < e) {
7548                    ch = *s;
7549                    x = mapdata_ucs2[ch];
7550                    if (x == 0xFFFE)
7551                        goto Error;
7552                    outdata[writer->pos] = x;
7553                    writer->pos++;
7554                    ++s;
7555                }
7556                break;
7557            }
7558        }
7559        ch = *s;
7560
7561        if (ch < maplen)
7562            x = PyUnicode_READ(mapkind, mapdata, ch);
7563        else
7564            x = 0xfffe; /* invalid value */
7565Error:
7566        if (x == 0xfffe)
7567        {
7568            /* undefined mapping */
7569            startinpos = s-starts;
7570            endinpos = startinpos+1;
7571            if (unicode_decode_call_errorhandler_writer(
7572                    errors, &errorHandler,
7573                    "charmap", "character maps to <undefined>",
7574                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7575                    writer)) {
7576                goto onError;
7577            }
7578            continue;
7579        }
7580
7581        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7582            goto onError;
7583        ++s;
7584    }
7585    Py_XDECREF(errorHandler);
7586    Py_XDECREF(exc);
7587    return 0;
7588
7589onError:
7590    Py_XDECREF(errorHandler);
7591    Py_XDECREF(exc);
7592    return -1;
7593}
7594
7595static int
7596charmap_decode_mapping(const char *s,
7597                       Py_ssize_t size,
7598                       PyObject *mapping,
7599                       const char *errors,
7600                       _PyUnicodeWriter *writer)
7601{
7602    const char *starts = s;
7603    const char *e;
7604    Py_ssize_t startinpos, endinpos;
7605    PyObject *errorHandler = NULL, *exc = NULL;
7606    unsigned char ch;
7607    PyObject *key, *item = NULL;
7608
7609    e = s + size;
7610
7611    while (s < e) {
7612        ch = *s;
7613
7614        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7615        key = PyLong_FromLong((long)ch);
7616        if (key == NULL)
7617            goto onError;
7618
7619        item = PyObject_GetItem(mapping, key);
7620        Py_DECREF(key);
7621        if (item == NULL) {
7622            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7623                /* No mapping found means: mapping is undefined. */
7624                PyErr_Clear();
7625                goto Undefined;
7626            } else
7627                goto onError;
7628        }
7629
7630        /* Apply mapping */
7631        if (item == Py_None)
7632            goto Undefined;
7633        if (PyLong_Check(item)) {
7634            long value = PyLong_AS_LONG(item);
7635            if (value == 0xFFFE)
7636                goto Undefined;
7637            if (value < 0 || value > MAX_UNICODE) {
7638                PyErr_Format(PyExc_TypeError,
7639                             "character mapping must be in range(0x%lx)",
7640                             (unsigned long)MAX_UNICODE + 1);
7641                goto onError;
7642            }
7643
7644            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7645                goto onError;
7646        }
7647        else if (PyUnicode_Check(item)) {
7648            if (PyUnicode_READY(item) == -1)
7649                goto onError;
7650            if (PyUnicode_GET_LENGTH(item) == 1) {
7651                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7652                if (value == 0xFFFE)
7653                    goto Undefined;
7654                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7655                    goto onError;
7656            }
7657            else {
7658                writer->overallocate = 1;
7659                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7660                    goto onError;
7661            }
7662        }
7663        else {
7664            /* wrong return value */
7665            PyErr_SetString(PyExc_TypeError,
7666                            "character mapping must return integer, None or str");
7667            goto onError;
7668        }
7669        Py_CLEAR(item);
7670        ++s;
7671        continue;
7672
7673Undefined:
7674        /* undefined mapping */
7675        Py_CLEAR(item);
7676        startinpos = s-starts;
7677        endinpos = startinpos+1;
7678        if (unicode_decode_call_errorhandler_writer(
7679                errors, &errorHandler,
7680                "charmap", "character maps to <undefined>",
7681                &starts, &e, &startinpos, &endinpos, &exc, &s,
7682                writer)) {
7683            goto onError;
7684        }
7685    }
7686    Py_XDECREF(errorHandler);
7687    Py_XDECREF(exc);
7688    return 0;
7689
7690onError:
7691    Py_XDECREF(item);
7692    Py_XDECREF(errorHandler);
7693    Py_XDECREF(exc);
7694    return -1;
7695}
7696
7697PyObject *
7698PyUnicode_DecodeCharmap(const char *s,
7699                        Py_ssize_t size,
7700                        PyObject *mapping,
7701                        const char *errors)
7702{
7703    _PyUnicodeWriter writer;
7704
7705    /* Default to Latin-1 */
7706    if (mapping == NULL)
7707        return PyUnicode_DecodeLatin1(s, size, errors);
7708
7709    if (size == 0)
7710        _Py_RETURN_UNICODE_EMPTY();
7711    _PyUnicodeWriter_Init(&writer);
7712    writer.min_length = size;
7713    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7714        goto onError;
7715
7716    if (PyUnicode_CheckExact(mapping)) {
7717        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7718            goto onError;
7719    }
7720    else {
7721        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7722            goto onError;
7723    }
7724    return _PyUnicodeWriter_Finish(&writer);
7725
7726  onError:
7727    _PyUnicodeWriter_Dealloc(&writer);
7728    return NULL;
7729}
7730
7731/* Charmap encoding: the lookup table */
7732
7733struct encoding_map {
7734    PyObject_HEAD
7735    unsigned char level1[32];
7736    int count2, count3;
7737    unsigned char level23[1];
7738};
7739
7740static PyObject*
7741encoding_map_size(PyObject *obj, PyObject* args)
7742{
7743    struct encoding_map *map = (struct encoding_map*)obj;
7744    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7745                           128*map->count3);
7746}
7747
7748static PyMethodDef encoding_map_methods[] = {
7749    {"size", encoding_map_size, METH_NOARGS,
7750     PyDoc_STR("Return the size (in bytes) of this object") },
7751    { 0 }
7752};
7753
7754static void
7755encoding_map_dealloc(PyObject* o)
7756{
7757    PyObject_FREE(o);
7758}
7759
7760static PyTypeObject EncodingMapType = {
7761    PyVarObject_HEAD_INIT(NULL, 0)
7762    "EncodingMap",          /*tp_name*/
7763    sizeof(struct encoding_map),   /*tp_basicsize*/
7764    0,                      /*tp_itemsize*/
7765    /* methods */
7766    encoding_map_dealloc,   /*tp_dealloc*/
7767    0,                      /*tp_print*/
7768    0,                      /*tp_getattr*/
7769    0,                      /*tp_setattr*/
7770    0,                      /*tp_reserved*/
7771    0,                      /*tp_repr*/
7772    0,                      /*tp_as_number*/
7773    0,                      /*tp_as_sequence*/
7774    0,                      /*tp_as_mapping*/
7775    0,                      /*tp_hash*/
7776    0,                      /*tp_call*/
7777    0,                      /*tp_str*/
7778    0,                      /*tp_getattro*/
7779    0,                      /*tp_setattro*/
7780    0,                      /*tp_as_buffer*/
7781    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7782    0,                      /*tp_doc*/
7783    0,                      /*tp_traverse*/
7784    0,                      /*tp_clear*/
7785    0,                      /*tp_richcompare*/
7786    0,                      /*tp_weaklistoffset*/
7787    0,                      /*tp_iter*/
7788    0,                      /*tp_iternext*/
7789    encoding_map_methods,   /*tp_methods*/
7790    0,                      /*tp_members*/
7791    0,                      /*tp_getset*/
7792    0,                      /*tp_base*/
7793    0,                      /*tp_dict*/
7794    0,                      /*tp_descr_get*/
7795    0,                      /*tp_descr_set*/
7796    0,                      /*tp_dictoffset*/
7797    0,                      /*tp_init*/
7798    0,                      /*tp_alloc*/
7799    0,                      /*tp_new*/
7800    0,                      /*tp_free*/
7801    0,                      /*tp_is_gc*/
7802};
7803
7804PyObject*
7805PyUnicode_BuildEncodingMap(PyObject* string)
7806{
7807    PyObject *result;
7808    struct encoding_map *mresult;
7809    int i;
7810    int need_dict = 0;
7811    unsigned char level1[32];
7812    unsigned char level2[512];
7813    unsigned char *mlevel1, *mlevel2, *mlevel3;
7814    int count2 = 0, count3 = 0;
7815    int kind;
7816    void *data;
7817    Py_ssize_t length;
7818    Py_UCS4 ch;
7819
7820    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7821        PyErr_BadArgument();
7822        return NULL;
7823    }
7824    kind = PyUnicode_KIND(string);
7825    data = PyUnicode_DATA(string);
7826    length = PyUnicode_GET_LENGTH(string);
7827    length = Py_MIN(length, 256);
7828    memset(level1, 0xFF, sizeof level1);
7829    memset(level2, 0xFF, sizeof level2);
7830
7831    /* If there isn't a one-to-one mapping of NULL to \0,
7832       or if there are non-BMP characters, we need to use
7833       a mapping dictionary. */
7834    if (PyUnicode_READ(kind, data, 0) != 0)
7835        need_dict = 1;
7836    for (i = 1; i < length; i++) {
7837        int l1, l2;
7838        ch = PyUnicode_READ(kind, data, i);
7839        if (ch == 0 || ch > 0xFFFF) {
7840            need_dict = 1;
7841            break;
7842        }
7843        if (ch == 0xFFFE)
7844            /* unmapped character */
7845            continue;
7846        l1 = ch >> 11;
7847        l2 = ch >> 7;
7848        if (level1[l1] == 0xFF)
7849            level1[l1] = count2++;
7850        if (level2[l2] == 0xFF)
7851            level2[l2] = count3++;
7852    }
7853
7854    if (count2 >= 0xFF || count3 >= 0xFF)
7855        need_dict = 1;
7856
7857    if (need_dict) {
7858        PyObject *result = PyDict_New();
7859        PyObject *key, *value;
7860        if (!result)
7861            return NULL;
7862        for (i = 0; i < length; i++) {
7863            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7864            value = PyLong_FromLong(i);
7865            if (!key || !value)
7866                goto failed1;
7867            if (PyDict_SetItem(result, key, value) == -1)
7868                goto failed1;
7869            Py_DECREF(key);
7870            Py_DECREF(value);
7871        }
7872        return result;
7873      failed1:
7874        Py_XDECREF(key);
7875        Py_XDECREF(value);
7876        Py_DECREF(result);
7877        return NULL;
7878    }
7879
7880    /* Create a three-level trie */
7881    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7882                             16*count2 + 128*count3 - 1);
7883    if (!result)
7884        return PyErr_NoMemory();
7885    PyObject_Init(result, &EncodingMapType);
7886    mresult = (struct encoding_map*)result;
7887    mresult->count2 = count2;
7888    mresult->count3 = count3;
7889    mlevel1 = mresult->level1;
7890    mlevel2 = mresult->level23;
7891    mlevel3 = mresult->level23 + 16*count2;
7892    memcpy(mlevel1, level1, 32);
7893    memset(mlevel2, 0xFF, 16*count2);
7894    memset(mlevel3, 0, 128*count3);
7895    count3 = 0;
7896    for (i = 1; i < length; i++) {
7897        int o1, o2, o3, i2, i3;
7898        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7899        if (ch == 0xFFFE)
7900            /* unmapped character */
7901            continue;
7902        o1 = ch>>11;
7903        o2 = (ch>>7) & 0xF;
7904        i2 = 16*mlevel1[o1] + o2;
7905        if (mlevel2[i2] == 0xFF)
7906            mlevel2[i2] = count3++;
7907        o3 = ch & 0x7F;
7908        i3 = 128*mlevel2[i2] + o3;
7909        mlevel3[i3] = i;
7910    }
7911    return result;
7912}
7913
7914static int
7915encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7916{
7917    struct encoding_map *map = (struct encoding_map*)mapping;
7918    int l1 = c>>11;
7919    int l2 = (c>>7) & 0xF;
7920    int l3 = c & 0x7F;
7921    int i;
7922
7923    if (c > 0xFFFF)
7924        return -1;
7925    if (c == 0)
7926        return 0;
7927    /* level 1*/
7928    i = map->level1[l1];
7929    if (i == 0xFF) {
7930        return -1;
7931    }
7932    /* level 2*/
7933    i = map->level23[16*i+l2];
7934    if (i == 0xFF) {
7935        return -1;
7936    }
7937    /* level 3 */
7938    i = map->level23[16*map->count2 + 128*i + l3];
7939    if (i == 0) {
7940        return -1;
7941    }
7942    return i;
7943}
7944
7945/* Lookup the character ch in the mapping. If the character
7946   can't be found, Py_None is returned (or NULL, if another
7947   error occurred). */
7948static PyObject *
7949charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7950{
7951    PyObject *w = PyLong_FromLong((long)c);
7952    PyObject *x;
7953
7954    if (w == NULL)
7955        return NULL;
7956    x = PyObject_GetItem(mapping, w);
7957    Py_DECREF(w);
7958    if (x == NULL) {
7959        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7960            /* No mapping found means: mapping is undefined. */
7961            PyErr_Clear();
7962            x = Py_None;
7963            Py_INCREF(x);
7964            return x;
7965        } else
7966            return NULL;
7967    }
7968    else if (x == Py_None)
7969        return x;
7970    else if (PyLong_Check(x)) {
7971        long value = PyLong_AS_LONG(x);
7972        if (value < 0 || value > 255) {
7973            PyErr_SetString(PyExc_TypeError,
7974                            "character mapping must be in range(256)");
7975            Py_DECREF(x);
7976            return NULL;
7977        }
7978        return x;
7979    }
7980    else if (PyBytes_Check(x))
7981        return x;
7982    else {
7983        /* wrong return value */
7984        PyErr_Format(PyExc_TypeError,
7985                     "character mapping must return integer, bytes or None, not %.400s",
7986                     x->ob_type->tp_name);
7987        Py_DECREF(x);
7988        return NULL;
7989    }
7990}
7991
7992static int
7993charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7994{
7995    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7996    /* exponentially overallocate to minimize reallocations */
7997    if (requiredsize < 2*outsize)
7998        requiredsize = 2*outsize;
7999    if (_PyBytes_Resize(outobj, requiredsize))
8000        return -1;
8001    return 0;
8002}
8003
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
8007/* lookup the character, put the result in the output string and adjust
8008   various state variables. Resize the output bytes object if not enough
8009   space is available. Return a new reference to the object that
8010   was put in the output buffer, or Py_None, if the mapping was undefined
8011   (in which case no character was written) or NULL, if a
8012   reallocation error occurred. The caller must decref the result */
8013static charmapencode_result
8014charmapencode_output(Py_UCS4 c, PyObject *mapping,
8015                     PyObject **outobj, Py_ssize_t *outpos)
8016{
8017    PyObject *rep;
8018    char *outstart;
8019    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8020
8021    if (Py_TYPE(mapping) == &EncodingMapType) {
8022        int res = encoding_map_lookup(c, mapping);
8023        Py_ssize_t requiredsize = *outpos+1;
8024        if (res == -1)
8025            return enc_FAILED;
8026        if (outsize<requiredsize)
8027            if (charmapencode_resize(outobj, outpos, requiredsize))
8028                return enc_EXCEPTION;
8029        outstart = PyBytes_AS_STRING(*outobj);
8030        outstart[(*outpos)++] = (char)res;
8031        return enc_SUCCESS;
8032    }
8033
8034    rep = charmapencode_lookup(c, mapping);
8035    if (rep==NULL)
8036        return enc_EXCEPTION;
8037    else if (rep==Py_None) {
8038        Py_DECREF(rep);
8039        return enc_FAILED;
8040    } else {
8041        if (PyLong_Check(rep)) {
8042            Py_ssize_t requiredsize = *outpos+1;
8043            if (outsize<requiredsize)
8044                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8045                    Py_DECREF(rep);
8046                    return enc_EXCEPTION;
8047                }
8048            outstart = PyBytes_AS_STRING(*outobj);
8049            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8050        }
8051        else {
8052            const char *repchars = PyBytes_AS_STRING(rep);
8053            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8054            Py_ssize_t requiredsize = *outpos+repsize;
8055            if (outsize<requiredsize)
8056                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8057                    Py_DECREF(rep);
8058                    return enc_EXCEPTION;
8059                }
8060            outstart = PyBytes_AS_STRING(*outobj);
8061            memcpy(outstart + *outpos, repchars, repsize);
8062            *outpos += repsize;
8063        }
8064    }
8065    Py_DECREF(rep);
8066    return enc_SUCCESS;
8067}
8068
8069/* handle an error in PyUnicode_EncodeCharmap
8070   Return 0 on success, -1 on error */
8071static int
8072charmap_encoding_error(
8073    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8074    PyObject **exceptionObject,
8075    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8076    PyObject **res, Py_ssize_t *respos)
8077{
8078    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8079    Py_ssize_t size, repsize;
8080    Py_ssize_t newpos;
8081    enum PyUnicode_Kind kind;
8082    void *data;
8083    Py_ssize_t index;
8084    /* startpos for collecting unencodable chars */
8085    Py_ssize_t collstartpos = *inpos;
8086    Py_ssize_t collendpos = *inpos+1;
8087    Py_ssize_t collpos;
8088    char *encoding = "charmap";
8089    char *reason = "character maps to <undefined>";
8090    charmapencode_result x;
8091    Py_UCS4 ch;
8092    int val;
8093
8094    if (PyUnicode_READY(unicode) == -1)
8095        return -1;
8096    size = PyUnicode_GET_LENGTH(unicode);
8097    /* find all unencodable characters */
8098    while (collendpos < size) {
8099        PyObject *rep;
8100        if (Py_TYPE(mapping) == &EncodingMapType) {
8101            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8102            val = encoding_map_lookup(ch, mapping);
8103            if (val != -1)
8104                break;
8105            ++collendpos;
8106            continue;
8107        }
8108
8109        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8110        rep = charmapencode_lookup(ch, mapping);
8111        if (rep==NULL)
8112            return -1;
8113        else if (rep!=Py_None) {
8114            Py_DECREF(rep);
8115            break;
8116        }
8117        Py_DECREF(rep);
8118        ++collendpos;
8119    }
8120    /* cache callback name lookup
8121     * (if not done yet, i.e. it's the first error) */
8122    if (*known_errorHandler==-1) {
8123        if ((errors==NULL) || (!strcmp(errors, "strict")))
8124            *known_errorHandler = 1;
8125        else if (!strcmp(errors, "replace"))
8126            *known_errorHandler = 2;
8127        else if (!strcmp(errors, "ignore"))
8128            *known_errorHandler = 3;
8129        else if (!strcmp(errors, "xmlcharrefreplace"))
8130            *known_errorHandler = 4;
8131        else
8132            *known_errorHandler = 0;
8133    }
8134    switch (*known_errorHandler) {
8135    case 1: /* strict */
8136        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8137        return -1;
8138    case 2: /* replace */
8139        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8140            x = charmapencode_output('?', mapping, res, respos);
8141            if (x==enc_EXCEPTION) {
8142                return -1;
8143            }
8144            else if (x==enc_FAILED) {
8145                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8146                return -1;
8147            }
8148        }
8149        /* fall through */
8150    case 3: /* ignore */
8151        *inpos = collendpos;
8152        break;
8153    case 4: /* xmlcharrefreplace */
8154        /* generate replacement (temporarily (mis)uses p) */
8155        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8156            char buffer[2+29+1+1];
8157            char *cp;
8158            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8159            for (cp = buffer; *cp; ++cp) {
8160                x = charmapencode_output(*cp, mapping, res, respos);
8161                if (x==enc_EXCEPTION)
8162                    return -1;
8163                else if (x==enc_FAILED) {
8164                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8165                    return -1;
8166                }
8167            }
8168        }
8169        *inpos = collendpos;
8170        break;
8171    default:
8172        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8173                                                      encoding, reason, unicode, exceptionObject,
8174                                                      collstartpos, collendpos, &newpos);
8175        if (repunicode == NULL)
8176            return -1;
8177        if (PyBytes_Check(repunicode)) {
8178            /* Directly copy bytes result to output. */
8179            Py_ssize_t outsize = PyBytes_Size(*res);
8180            Py_ssize_t requiredsize;
8181            repsize = PyBytes_Size(repunicode);
8182            requiredsize = *respos + repsize;
8183            if (requiredsize > outsize)
8184                /* Make room for all additional bytes. */
8185                if (charmapencode_resize(res, respos, requiredsize)) {
8186                    Py_DECREF(repunicode);
8187                    return -1;
8188                }
8189            memcpy(PyBytes_AsString(*res) + *respos,
8190                   PyBytes_AsString(repunicode),  repsize);
8191            *respos += repsize;
8192            *inpos = newpos;
8193            Py_DECREF(repunicode);
8194            break;
8195        }
8196        /* generate replacement  */
8197        if (PyUnicode_READY(repunicode) == -1) {
8198            Py_DECREF(repunicode);
8199            return -1;
8200        }
8201        repsize = PyUnicode_GET_LENGTH(repunicode);
8202        data = PyUnicode_DATA(repunicode);
8203        kind = PyUnicode_KIND(repunicode);
8204        for (index = 0; index < repsize; index++) {
8205            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8206            x = charmapencode_output(repch, mapping, res, respos);
8207            if (x==enc_EXCEPTION) {
8208                Py_DECREF(repunicode);
8209                return -1;
8210            }
8211            else if (x==enc_FAILED) {
8212                Py_DECREF(repunicode);
8213                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8214                return -1;
8215            }
8216        }
8217        *inpos = newpos;
8218        Py_DECREF(repunicode);
8219    }
8220    return 0;
8221}
8222
8223PyObject *
8224_PyUnicode_EncodeCharmap(PyObject *unicode,
8225                         PyObject *mapping,
8226                         const char *errors)
8227{
8228    /* output object */
8229    PyObject *res = NULL;
8230    /* current input position */
8231    Py_ssize_t inpos = 0;
8232    Py_ssize_t size;
8233    /* current output position */
8234    Py_ssize_t respos = 0;
8235    PyObject *errorHandler = NULL;
8236    PyObject *exc = NULL;
8237    /* the following variable is used for caching string comparisons
8238     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8239     * 3=ignore, 4=xmlcharrefreplace */
8240    int known_errorHandler = -1;
8241    void *data;
8242    int kind;
8243
8244    if (PyUnicode_READY(unicode) == -1)
8245        return NULL;
8246    size = PyUnicode_GET_LENGTH(unicode);
8247    data = PyUnicode_DATA(unicode);
8248    kind = PyUnicode_KIND(unicode);
8249
8250    /* Default to Latin-1 */
8251    if (mapping == NULL)
8252        return unicode_encode_ucs1(unicode, errors, 256);
8253
8254    /* allocate enough for a simple encoding without
8255       replacements, if we need more, we'll resize */
8256    res = PyBytes_FromStringAndSize(NULL, size);
8257    if (res == NULL)
8258        goto onError;
8259    if (size == 0)
8260        return res;
8261
8262    while (inpos<size) {
8263        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8264        /* try to encode it */
8265        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8266        if (x==enc_EXCEPTION) /* error */
8267            goto onError;
8268        if (x==enc_FAILED) { /* unencodable character */
8269            if (charmap_encoding_error(unicode, &inpos, mapping,
8270                                       &exc,
8271                                       &known_errorHandler, &errorHandler, errors,
8272                                       &res, &respos)) {
8273                goto onError;
8274            }
8275        }
8276        else
8277            /* done with this character => adjust input position */
8278            ++inpos;
8279    }
8280
8281    /* Resize if we allocated to much */
8282    if (respos<PyBytes_GET_SIZE(res))
8283        if (_PyBytes_Resize(&res, respos) < 0)
8284            goto onError;
8285
8286    Py_XDECREF(exc);
8287    Py_XDECREF(errorHandler);
8288    return res;
8289
8290  onError:
8291    Py_XDECREF(res);
8292    Py_XDECREF(exc);
8293    Py_XDECREF(errorHandler);
8294    return NULL;
8295}
8296
8297/* Deprecated */
8298PyObject *
8299PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8300                        Py_ssize_t size,
8301                        PyObject *mapping,
8302                        const char *errors)
8303{
8304    PyObject *result;
8305    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8306    if (unicode == NULL)
8307        return NULL;
8308    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8309    Py_DECREF(unicode);
8310    return result;
8311}
8312
8313PyObject *
8314PyUnicode_AsCharmapString(PyObject *unicode,
8315                          PyObject *mapping)
8316{
8317    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8318        PyErr_BadArgument();
8319        return NULL;
8320    }
8321    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8322}
8323
8324/* create or adjust a UnicodeTranslateError */
8325static void
8326make_translate_exception(PyObject **exceptionObject,
8327                         PyObject *unicode,
8328                         Py_ssize_t startpos, Py_ssize_t endpos,
8329                         const char *reason)
8330{
8331    if (*exceptionObject == NULL) {
8332        *exceptionObject = _PyUnicodeTranslateError_Create(
8333            unicode, startpos, endpos, reason);
8334    }
8335    else {
8336        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8337            goto onError;
8338        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8339            goto onError;
8340        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8341            goto onError;
8342        return;
8343      onError:
8344        Py_CLEAR(*exceptionObject);
8345    }
8346}
8347
8348/* error handling callback helper:
8349   build arguments, call the callback and check the arguments,
8350   put the result into newpos and return the replacement string, which
8351   has to be freed by the caller */
8352static PyObject *
8353unicode_translate_call_errorhandler(const char *errors,
8354                                    PyObject **errorHandler,
8355                                    const char *reason,
8356                                    PyObject *unicode, PyObject **exceptionObject,
8357                                    Py_ssize_t startpos, Py_ssize_t endpos,
8358                                    Py_ssize_t *newpos)
8359{
8360    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8361
8362    Py_ssize_t i_newpos;
8363    PyObject *restuple;
8364    PyObject *resunicode;
8365
8366    if (*errorHandler == NULL) {
8367        *errorHandler = PyCodec_LookupError(errors);
8368        if (*errorHandler == NULL)
8369            return NULL;
8370    }
8371
8372    make_translate_exception(exceptionObject,
8373                             unicode, startpos, endpos, reason);
8374    if (*exceptionObject == NULL)
8375        return NULL;
8376
8377    restuple = PyObject_CallFunctionObjArgs(
8378        *errorHandler, *exceptionObject, NULL);
8379    if (restuple == NULL)
8380        return NULL;
8381    if (!PyTuple_Check(restuple)) {
8382        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8383        Py_DECREF(restuple);
8384        return NULL;
8385    }
8386    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8387                          &resunicode, &i_newpos)) {
8388        Py_DECREF(restuple);
8389        return NULL;
8390    }
8391    if (i_newpos<0)
8392        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8393    else
8394        *newpos = i_newpos;
8395    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8396        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8397        Py_DECREF(restuple);
8398        return NULL;
8399    }
8400    Py_INCREF(resunicode);
8401    Py_DECREF(restuple);
8402    return resunicode;
8403}
8404
8405/* Lookup the character ch in the mapping and put the result in result,
8406   which must be decrefed by the caller.
8407   Return 0 on success, -1 on error */
8408static int
8409charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8410{
8411    PyObject *w = PyLong_FromLong((long)c);
8412    PyObject *x;
8413
8414    if (w == NULL)
8415        return -1;
8416    x = PyObject_GetItem(mapping, w);
8417    Py_DECREF(w);
8418    if (x == NULL) {
8419        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8420            /* No mapping found means: use 1:1 mapping. */
8421            PyErr_Clear();
8422            *result = NULL;
8423            return 0;
8424        } else
8425            return -1;
8426    }
8427    else if (x == Py_None) {
8428        *result = x;
8429        return 0;
8430    }
8431    else if (PyLong_Check(x)) {
8432        long value = PyLong_AS_LONG(x);
8433        if (value < 0 || value > MAX_UNICODE) {
8434            PyErr_Format(PyExc_ValueError,
8435                         "character mapping must be in range(0x%x)",
8436                         MAX_UNICODE+1);
8437            Py_DECREF(x);
8438            return -1;
8439        }
8440        *result = x;
8441        return 0;
8442    }
8443    else if (PyUnicode_Check(x)) {
8444        *result = x;
8445        return 0;
8446    }
8447    else {
8448        /* wrong return value */
8449        PyErr_SetString(PyExc_TypeError,
8450                        "character mapping must return integer, None or str");
8451        Py_DECREF(x);
8452        return -1;
8453    }
8454}
8455
8456/* lookup the character, write the result into the writer.
8457   Return 1 if the result was written into the writer, return 0 if the mapping
8458   was undefined, raise an exception return -1 on error. */
8459static int
8460charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8461                        _PyUnicodeWriter *writer)
8462{
8463    PyObject *item;
8464
8465    if (charmaptranslate_lookup(ch, mapping, &item))
8466        return -1;
8467
8468    if (item == NULL) {
8469        /* not found => default to 1:1 mapping */
8470        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8471            return -1;
8472        }
8473        return 1;
8474    }
8475
8476    if (item == Py_None) {
8477        Py_DECREF(item);
8478        return 0;
8479    }
8480
8481    if (PyLong_Check(item)) {
8482        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8483        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8484           used it */
8485        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8486            Py_DECREF(item);
8487            return -1;
8488        }
8489        Py_DECREF(item);
8490        return 1;
8491    }
8492
8493    if (!PyUnicode_Check(item)) {
8494        Py_DECREF(item);
8495        return -1;
8496    }
8497
8498    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8499        Py_DECREF(item);
8500        return -1;
8501    }
8502
8503    Py_DECREF(item);
8504    return 1;
8505}
8506
8507static int
8508unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8509                              Py_UCS1 *translate)
8510{
8511    PyObject *item = NULL;
8512    int ret = 0;
8513
8514    if (charmaptranslate_lookup(ch, mapping, &item)) {
8515        return -1;
8516    }
8517
8518    if (item == Py_None) {
8519        /* deletion */
8520        translate[ch] = 0xfe;
8521    }
8522    else if (item == NULL) {
8523        /* not found => default to 1:1 mapping */
8524        translate[ch] = ch;
8525        return 1;
8526    }
8527    else if (PyLong_Check(item)) {
8528        long replace = PyLong_AS_LONG(item);
8529        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8530           used it */
8531        if (127 < replace) {
8532            /* invalid character or character outside ASCII:
8533               skip the fast translate */
8534            goto exit;
8535        }
8536        translate[ch] = (Py_UCS1)replace;
8537    }
8538    else if (PyUnicode_Check(item)) {
8539        Py_UCS4 replace;
8540
8541        if (PyUnicode_READY(item) == -1) {
8542            Py_DECREF(item);
8543            return -1;
8544        }
8545        if (PyUnicode_GET_LENGTH(item) != 1)
8546            goto exit;
8547
8548        replace = PyUnicode_READ_CHAR(item, 0);
8549        if (replace > 127)
8550            goto exit;
8551        translate[ch] = (Py_UCS1)replace;
8552    }
8553    else {
8554        /* not None, NULL, long or unicode */
8555        goto exit;
8556    }
8557    ret = 1;
8558
8559  exit:
8560    Py_DECREF(item);
8561    return ret;
8562}
8563
8564/* Fast path for ascii => ascii translation. Return 1 if the whole string
8565   was translated into writer, return 0 if the input string was partially
8566   translated into writer, raise an exception and return -1 on error. */
8567static int
8568unicode_fast_translate(PyObject *input, PyObject *mapping,
8569                       _PyUnicodeWriter *writer, int ignore)
8570{
8571    Py_UCS1 ascii_table[128], ch, ch2;
8572    Py_ssize_t len;
8573    Py_UCS1 *in, *end, *out;
8574    int res = 0;
8575
8576    if (PyUnicode_READY(input) == -1)
8577        return -1;
8578    if (!PyUnicode_IS_ASCII(input))
8579        return 0;
8580    len = PyUnicode_GET_LENGTH(input);
8581
8582    memset(ascii_table, 0xff, 128);
8583
8584    in = PyUnicode_1BYTE_DATA(input);
8585    end = in + len;
8586
8587    assert(PyUnicode_IS_ASCII(writer->buffer));
8588    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8589    out = PyUnicode_1BYTE_DATA(writer->buffer);
8590
8591    for (; in < end; in++) {
8592        ch = *in;
8593        ch2 = ascii_table[ch];
8594        if (ch2 == 0xff) {
8595            int translate = unicode_fast_translate_lookup(mapping, ch,
8596                                                          ascii_table);
8597            if (translate < 0)
8598                return -1;
8599            if (translate == 0)
8600                goto exit;
8601            ch2 = ascii_table[ch];
8602        }
8603        if (ch2 == 0xfe) {
8604            if (ignore)
8605                continue;
8606            goto exit;
8607        }
8608        assert(ch2 < 128);
8609        *out = ch2;
8610        out++;
8611    }
8612    res = 1;
8613
8614exit:
8615    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8616    return res;
8617}
8618
8619PyObject *
8620_PyUnicode_TranslateCharmap(PyObject *input,
8621                            PyObject *mapping,
8622                            const char *errors)
8623{
8624    /* input object */
8625    char *data;
8626    Py_ssize_t size, i;
8627    int kind;
8628    /* output buffer */
8629    _PyUnicodeWriter writer;
8630    /* error handler */
8631    char *reason = "character maps to <undefined>";
8632    PyObject *errorHandler = NULL;
8633    PyObject *exc = NULL;
8634    int ignore;
8635    int res;
8636
8637    if (mapping == NULL) {
8638        PyErr_BadArgument();
8639        return NULL;
8640    }
8641
8642    if (PyUnicode_READY(input) == -1)
8643        return NULL;
8644    data = (char*)PyUnicode_DATA(input);
8645    kind = PyUnicode_KIND(input);
8646    size = PyUnicode_GET_LENGTH(input);
8647
8648    if (size == 0) {
8649        Py_INCREF(input);
8650        return input;
8651    }
8652
8653    /* allocate enough for a simple 1:1 translation without
8654       replacements, if we need more, we'll resize */
8655    _PyUnicodeWriter_Init(&writer);
8656    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8657        goto onError;
8658
8659    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8660
8661    res = unicode_fast_translate(input, mapping, &writer, ignore);
8662    if (res < 0) {
8663        _PyUnicodeWriter_Dealloc(&writer);
8664        return NULL;
8665    }
8666    if (res == 1)
8667        return _PyUnicodeWriter_Finish(&writer);
8668
8669    i = writer.pos;
8670    while (i<size) {
8671        /* try to encode it */
8672        int translate;
8673        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8674        Py_ssize_t newpos;
8675        /* startpos for collecting untranslatable chars */
8676        Py_ssize_t collstart;
8677        Py_ssize_t collend;
8678        Py_UCS4 ch;
8679
8680        ch = PyUnicode_READ(kind, data, i);
8681        translate = charmaptranslate_output(ch, mapping, &writer);
8682        if (translate < 0)
8683            goto onError;
8684
8685        if (translate != 0) {
8686            /* it worked => adjust input pointer */
8687            ++i;
8688            continue;
8689        }
8690
8691        /* untranslatable character */
8692        collstart = i;
8693        collend = i+1;
8694
8695        /* find all untranslatable characters */
8696        while (collend < size) {
8697            PyObject *x;
8698            ch = PyUnicode_READ(kind, data, collend);
8699            if (charmaptranslate_lookup(ch, mapping, &x))
8700                goto onError;
8701            Py_XDECREF(x);
8702            if (x != Py_None)
8703                break;
8704            ++collend;
8705        }
8706
8707        if (ignore) {
8708            i = collend;
8709        }
8710        else {
8711            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8712                                                             reason, input, &exc,
8713                                                             collstart, collend, &newpos);
8714            if (repunicode == NULL)
8715                goto onError;
8716            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
8717                Py_DECREF(repunicode);
8718                goto onError;
8719            }
8720            Py_DECREF(repunicode);
8721            i = newpos;
8722        }
8723    }
8724    Py_XDECREF(exc);
8725    Py_XDECREF(errorHandler);
8726    return _PyUnicodeWriter_Finish(&writer);
8727
8728  onError:
8729    _PyUnicodeWriter_Dealloc(&writer);
8730    Py_XDECREF(exc);
8731    Py_XDECREF(errorHandler);
8732    return NULL;
8733}
8734
8735/* Deprecated. Use PyUnicode_Translate instead. */
8736PyObject *
8737PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8738                           Py_ssize_t size,
8739                           PyObject *mapping,
8740                           const char *errors)
8741{
8742    PyObject *result;
8743    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8744    if (!unicode)
8745        return NULL;
8746    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8747    Py_DECREF(unicode);
8748    return result;
8749}
8750
8751PyObject *
8752PyUnicode_Translate(PyObject *str,
8753                    PyObject *mapping,
8754                    const char *errors)
8755{
8756    PyObject *result;
8757
8758    str = PyUnicode_FromObject(str);
8759    if (str == NULL)
8760        return NULL;
8761    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8762    Py_DECREF(str);
8763    return result;
8764}
8765
8766static Py_UCS4
8767fix_decimal_and_space_to_ascii(PyObject *self)
8768{
8769    /* No need to call PyUnicode_READY(self) because this function is only
8770       called as a callback from fixup() which does it already. */
8771    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8772    const int kind = PyUnicode_KIND(self);
8773    void *data = PyUnicode_DATA(self);
8774    Py_UCS4 maxchar = 127, ch, fixed;
8775    int modified = 0;
8776    Py_ssize_t i;
8777
8778    for (i = 0; i < len; ++i) {
8779        ch = PyUnicode_READ(kind, data, i);
8780        fixed = 0;
8781        if (ch > 127) {
8782            if (Py_UNICODE_ISSPACE(ch))
8783                fixed = ' ';
8784            else {
8785                const int decimal = Py_UNICODE_TODECIMAL(ch);
8786                if (decimal >= 0)
8787                    fixed = '0' + decimal;
8788            }
8789            if (fixed != 0) {
8790                modified = 1;
8791                maxchar = Py_MAX(maxchar, fixed);
8792                PyUnicode_WRITE(kind, data, i, fixed);
8793            }
8794            else
8795                maxchar = Py_MAX(maxchar, ch);
8796        }
8797    }
8798
8799    return (modified) ? maxchar : 0;
8800}
8801
8802PyObject *
8803_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8804{
8805    if (!PyUnicode_Check(unicode)) {
8806        PyErr_BadInternalCall();
8807        return NULL;
8808    }
8809    if (PyUnicode_READY(unicode) == -1)
8810        return NULL;
8811    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8812        /* If the string is already ASCII, just return the same string */
8813        Py_INCREF(unicode);
8814        return unicode;
8815    }
8816    return fixup(unicode, fix_decimal_and_space_to_ascii);
8817}
8818
8819PyObject *
8820PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8821                                  Py_ssize_t length)
8822{
8823    PyObject *decimal;
8824    Py_ssize_t i;
8825    Py_UCS4 maxchar;
8826    enum PyUnicode_Kind kind;
8827    void *data;
8828
8829    maxchar = 127;
8830    for (i = 0; i < length; i++) {
8831        Py_UCS4 ch = s[i];
8832        if (ch > 127) {
8833            int decimal = Py_UNICODE_TODECIMAL(ch);
8834            if (decimal >= 0)
8835                ch = '0' + decimal;
8836            maxchar = Py_MAX(maxchar, ch);
8837        }
8838    }
8839
8840    /* Copy to a new string */
8841    decimal = PyUnicode_New(length, maxchar);
8842    if (decimal == NULL)
8843        return decimal;
8844    kind = PyUnicode_KIND(decimal);
8845    data = PyUnicode_DATA(decimal);
8846    /* Iterate over code points */
8847    for (i = 0; i < length; i++) {
8848        Py_UCS4 ch = s[i];
8849        if (ch > 127) {
8850            int decimal = Py_UNICODE_TODECIMAL(ch);
8851            if (decimal >= 0)
8852                ch = '0' + decimal;
8853        }
8854        PyUnicode_WRITE(kind, data, i, ch);
8855    }
8856    return unicode_result(decimal);
8857}
8858/* --- Decimal Encoder ---------------------------------------------------- */
8859
8860int
8861PyUnicode_EncodeDecimal(Py_UNICODE *s,
8862                        Py_ssize_t length,
8863                        char *output,
8864                        const char *errors)
8865{
8866    PyObject *unicode;
8867    Py_ssize_t i;
8868    enum PyUnicode_Kind kind;
8869    void *data;
8870
8871    if (output == NULL) {
8872        PyErr_BadArgument();
8873        return -1;
8874    }
8875
8876    unicode = PyUnicode_FromUnicode(s, length);
8877    if (unicode == NULL)
8878        return -1;
8879
8880    if (PyUnicode_READY(unicode) == -1) {
8881        Py_DECREF(unicode);
8882        return -1;
8883    }
8884    kind = PyUnicode_KIND(unicode);
8885    data = PyUnicode_DATA(unicode);
8886
8887    for (i=0; i < length; ) {
8888        PyObject *exc;
8889        Py_UCS4 ch;
8890        int decimal;
8891        Py_ssize_t startpos;
8892
8893        ch = PyUnicode_READ(kind, data, i);
8894
8895        if (Py_UNICODE_ISSPACE(ch)) {
8896            *output++ = ' ';
8897            i++;
8898            continue;
8899        }
8900        decimal = Py_UNICODE_TODECIMAL(ch);
8901        if (decimal >= 0) {
8902            *output++ = '0' + decimal;
8903            i++;
8904            continue;
8905        }
8906        if (0 < ch && ch < 256) {
8907            *output++ = (char)ch;
8908            i++;
8909            continue;
8910        }
8911
8912        startpos = i;
8913        exc = NULL;
8914        raise_encode_exception(&exc, "decimal", unicode,
8915                               startpos, startpos+1,
8916                               "invalid decimal Unicode string");
8917        Py_XDECREF(exc);
8918        Py_DECREF(unicode);
8919        return -1;
8920    }
8921    /* 0-terminate the output string */
8922    *output++ = '\0';
8923    Py_DECREF(unicode);
8924    return 0;
8925}
8926
8927/* --- Helpers ------------------------------------------------------------ */
8928
/* helper macro to fixup start/end slice values

   start and end are modified in place and must be lvalues: end is clipped
   to len, a negative start or end is taken relative to len and clipped to
   0.  start is not clipped to len.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used, followed by a semicolon, as the only
   statement of an unbraced if/else.  The arguments are evaluated more than
   once, so they must not have side effects. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8943
8944static Py_ssize_t
8945any_find_slice(int direction, PyObject* s1, PyObject* s2,
8946               Py_ssize_t start,
8947               Py_ssize_t end)
8948{
8949    int kind1, kind2;
8950    void *buf1, *buf2;
8951    Py_ssize_t len1, len2, result;
8952
8953    kind1 = PyUnicode_KIND(s1);
8954    kind2 = PyUnicode_KIND(s2);
8955    if (kind1 < kind2)
8956        return -1;
8957
8958    len1 = PyUnicode_GET_LENGTH(s1);
8959    len2 = PyUnicode_GET_LENGTH(s2);
8960    ADJUST_INDICES(start, end, len1);
8961    if (end - start < len2)
8962        return -1;
8963
8964    buf1 = PyUnicode_DATA(s1);
8965    buf2 = PyUnicode_DATA(s2);
8966    if (len2 == 1) {
8967        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8968        result = findchar((const char *)buf1 + kind1*start,
8969                          kind1, end - start, ch, direction);
8970        if (result == -1)
8971            return -1;
8972        else
8973            return start + result;
8974    }
8975
8976    if (kind2 != kind1) {
8977        buf2 = _PyUnicode_AsKind(s2, kind1);
8978        if (!buf2)
8979            return -2;
8980    }
8981
8982    if (direction > 0) {
8983        switch (kind1) {
8984        case PyUnicode_1BYTE_KIND:
8985            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8986                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8987            else
8988                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8989            break;
8990        case PyUnicode_2BYTE_KIND:
8991            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8992            break;
8993        case PyUnicode_4BYTE_KIND:
8994            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8995            break;
8996        default:
8997            assert(0); result = -2;
8998        }
8999    }
9000    else {
9001        switch (kind1) {
9002        case PyUnicode_1BYTE_KIND:
9003            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9004                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9005            else
9006                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9007            break;
9008        case PyUnicode_2BYTE_KIND:
9009            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9010            break;
9011        case PyUnicode_4BYTE_KIND:
9012            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9013            break;
9014        default:
9015            assert(0); result = -2;
9016        }
9017    }
9018
9019    if (kind2 != kind1)
9020        PyMem_Free(buf2);
9021
9022    return result;
9023}
9024
9025Py_ssize_t
9026_PyUnicode_InsertThousandsGrouping(
9027    PyObject *unicode, Py_ssize_t index,
9028    Py_ssize_t n_buffer,
9029    void *digits, Py_ssize_t n_digits,
9030    Py_ssize_t min_width,
9031    const char *grouping, PyObject *thousands_sep,
9032    Py_UCS4 *maxchar)
9033{
9034    unsigned int kind, thousands_sep_kind;
9035    char *data, *thousands_sep_data;
9036    Py_ssize_t thousands_sep_len;
9037    Py_ssize_t len;
9038
9039    if (unicode != NULL) {
9040        kind = PyUnicode_KIND(unicode);
9041        data = (char *) PyUnicode_DATA(unicode) + index * kind;
9042    }
9043    else {
9044        kind = PyUnicode_1BYTE_KIND;
9045        data = NULL;
9046    }
9047    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9048    thousands_sep_data = PyUnicode_DATA(thousands_sep);
9049    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9050    if (unicode != NULL && thousands_sep_kind != kind) {
9051        if (thousands_sep_kind < kind) {
9052            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9053            if (!thousands_sep_data)
9054                return -1;
9055        }
9056        else {
9057            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9058            if (!data)
9059                return -1;
9060        }
9061    }
9062
9063    switch (kind) {
9064    case PyUnicode_1BYTE_KIND:
9065        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9066            len = asciilib_InsertThousandsGrouping(
9067                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9068                min_width, grouping,
9069                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9070        else
9071            len = ucs1lib_InsertThousandsGrouping(
9072                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9073                min_width, grouping,
9074                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9075        break;
9076    case PyUnicode_2BYTE_KIND:
9077        len = ucs2lib_InsertThousandsGrouping(
9078            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9079            min_width, grouping,
9080            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9081        break;
9082    case PyUnicode_4BYTE_KIND:
9083        len = ucs4lib_InsertThousandsGrouping(
9084            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9085            min_width, grouping,
9086            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9087        break;
9088    default:
9089        assert(0);
9090        return -1;
9091    }
9092    if (unicode != NULL && thousands_sep_kind != kind) {
9093        if (thousands_sep_kind < kind)
9094            PyMem_Free(thousands_sep_data);
9095        else
9096            PyMem_Free(data);
9097    }
9098    if (unicode == NULL) {
9099        *maxchar = 127;
9100        if (len != n_digits) {
9101            *maxchar = Py_MAX(*maxchar,
9102                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9103        }
9104    }
9105    return len;
9106}
9107
9108
9109Py_ssize_t
9110PyUnicode_Count(PyObject *str,
9111                PyObject *substr,
9112                Py_ssize_t start,
9113                Py_ssize_t end)
9114{
9115    Py_ssize_t result;
9116    PyObject* str_obj;
9117    PyObject* sub_obj;
9118    int kind1, kind2;
9119    void *buf1 = NULL, *buf2 = NULL;
9120    Py_ssize_t len1, len2;
9121
9122    str_obj = PyUnicode_FromObject(str);
9123    if (!str_obj)
9124        return -1;
9125    sub_obj = PyUnicode_FromObject(substr);
9126    if (!sub_obj) {
9127        Py_DECREF(str_obj);
9128        return -1;
9129    }
9130    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9131        Py_DECREF(sub_obj);
9132        Py_DECREF(str_obj);
9133        return -1;
9134    }
9135
9136    kind1 = PyUnicode_KIND(str_obj);
9137    kind2 = PyUnicode_KIND(sub_obj);
9138    if (kind1 < kind2) {
9139        Py_DECREF(sub_obj);
9140        Py_DECREF(str_obj);
9141        return 0;
9142    }
9143
9144    len1 = PyUnicode_GET_LENGTH(str_obj);
9145    len2 = PyUnicode_GET_LENGTH(sub_obj);
9146    ADJUST_INDICES(start, end, len1);
9147    if (end - start < len2) {
9148        Py_DECREF(sub_obj);
9149        Py_DECREF(str_obj);
9150        return 0;
9151    }
9152
9153    buf1 = PyUnicode_DATA(str_obj);
9154    buf2 = PyUnicode_DATA(sub_obj);
9155    if (kind2 != kind1) {
9156        buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9157        if (!buf2)
9158            goto onError;
9159    }
9160
9161    switch (kind1) {
9162    case PyUnicode_1BYTE_KIND:
9163        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9164            result = asciilib_count(
9165                ((Py_UCS1*)buf1) + start, end - start,
9166                buf2, len2, PY_SSIZE_T_MAX
9167                );
9168        else
9169            result = ucs1lib_count(
9170                ((Py_UCS1*)buf1) + start, end - start,
9171                buf2, len2, PY_SSIZE_T_MAX
9172                );
9173        break;
9174    case PyUnicode_2BYTE_KIND:
9175        result = ucs2lib_count(
9176            ((Py_UCS2*)buf1) + start, end - start,
9177            buf2, len2, PY_SSIZE_T_MAX
9178            );
9179        break;
9180    case PyUnicode_4BYTE_KIND:
9181        result = ucs4lib_count(
9182            ((Py_UCS4*)buf1) + start, end - start,
9183            buf2, len2, PY_SSIZE_T_MAX
9184            );
9185        break;
9186    default:
9187        assert(0); result = 0;
9188    }
9189
9190    Py_DECREF(sub_obj);
9191    Py_DECREF(str_obj);
9192
9193    if (kind2 != kind1)
9194        PyMem_Free(buf2);
9195
9196    return result;
9197  onError:
9198    Py_DECREF(sub_obj);
9199    Py_DECREF(str_obj);
9200    if (kind2 != kind1 && buf2)
9201        PyMem_Free(buf2);
9202    return -1;
9203}
9204
9205Py_ssize_t
9206PyUnicode_Find(PyObject *str,
9207               PyObject *sub,
9208               Py_ssize_t start,
9209               Py_ssize_t end,
9210               int direction)
9211{
9212    Py_ssize_t result;
9213
9214    str = PyUnicode_FromObject(str);
9215    if (!str)
9216        return -2;
9217    sub = PyUnicode_FromObject(sub);
9218    if (!sub) {
9219        Py_DECREF(str);
9220        return -2;
9221    }
9222    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9223        Py_DECREF(sub);
9224        Py_DECREF(str);
9225        return -2;
9226    }
9227
9228    result = any_find_slice(direction,
9229        str, sub, start, end
9230        );
9231
9232    Py_DECREF(str);
9233    Py_DECREF(sub);
9234
9235    return result;
9236}
9237
9238Py_ssize_t
9239PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9240                   Py_ssize_t start, Py_ssize_t end,
9241                   int direction)
9242{
9243    int kind;
9244    Py_ssize_t result;
9245    if (PyUnicode_READY(str) == -1)
9246        return -2;
9247    if (start < 0 || end < 0) {
9248        PyErr_SetString(PyExc_IndexError, "string index out of range");
9249        return -2;
9250    }
9251    if (end > PyUnicode_GET_LENGTH(str))
9252        end = PyUnicode_GET_LENGTH(str);
9253    if (start >= end)
9254        return -1;
9255    kind = PyUnicode_KIND(str);
9256    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9257                      kind, end-start, ch, direction);
9258    if (result == -1)
9259        return -1;
9260    else
9261        return start + result;
9262}
9263
9264static int
9265tailmatch(PyObject *self,
9266          PyObject *substring,
9267          Py_ssize_t start,
9268          Py_ssize_t end,
9269          int direction)
9270{
9271    int kind_self;
9272    int kind_sub;
9273    void *data_self;
9274    void *data_sub;
9275    Py_ssize_t offset;
9276    Py_ssize_t i;
9277    Py_ssize_t end_sub;
9278
9279    if (PyUnicode_READY(self) == -1 ||
9280        PyUnicode_READY(substring) == -1)
9281        return -1;
9282
9283    if (PyUnicode_GET_LENGTH(substring) == 0)
9284        return 1;
9285
9286    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9287    end -= PyUnicode_GET_LENGTH(substring);
9288    if (end < start)
9289        return 0;
9290
9291    kind_self = PyUnicode_KIND(self);
9292    data_self = PyUnicode_DATA(self);
9293    kind_sub = PyUnicode_KIND(substring);
9294    data_sub = PyUnicode_DATA(substring);
9295    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9296
9297    if (direction > 0)
9298        offset = end;
9299    else
9300        offset = start;
9301
9302    if (PyUnicode_READ(kind_self, data_self, offset) ==
9303        PyUnicode_READ(kind_sub, data_sub, 0) &&
9304        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9305        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9306        /* If both are of the same kind, memcmp is sufficient */
9307        if (kind_self == kind_sub) {
9308            return ! memcmp((char *)data_self +
9309                                (offset * PyUnicode_KIND(substring)),
9310                            data_sub,
9311                            PyUnicode_GET_LENGTH(substring) *
9312                                PyUnicode_KIND(substring));
9313        }
9314        /* otherwise we have to compare each character by first accesing it */
9315        else {
9316            /* We do not need to compare 0 and len(substring)-1 because
9317               the if statement above ensured already that they are equal
9318               when we end up here. */
9319            for (i = 1; i < end_sub; ++i) {
9320                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9321                    PyUnicode_READ(kind_sub, data_sub, i))
9322                    return 0;
9323            }
9324            return 1;
9325        }
9326    }
9327
9328    return 0;
9329}
9330
9331Py_ssize_t
9332PyUnicode_Tailmatch(PyObject *str,
9333                    PyObject *substr,
9334                    Py_ssize_t start,
9335                    Py_ssize_t end,
9336                    int direction)
9337{
9338    Py_ssize_t result;
9339
9340    str = PyUnicode_FromObject(str);
9341    if (str == NULL)
9342        return -1;
9343    substr = PyUnicode_FromObject(substr);
9344    if (substr == NULL) {
9345        Py_DECREF(str);
9346        return -1;
9347    }
9348
9349    result = tailmatch(str, substr,
9350                       start, end, direction);
9351    Py_DECREF(str);
9352    Py_DECREF(substr);
9353    return result;
9354}
9355
9356/* Apply fixfct filter to the Unicode object self and return a
9357   reference to the modified object */
9358
9359static PyObject *
9360fixup(PyObject *self,
9361      Py_UCS4 (*fixfct)(PyObject *s))
9362{
9363    PyObject *u;
9364    Py_UCS4 maxchar_old, maxchar_new = 0;
9365    PyObject *v;
9366
9367    u = _PyUnicode_Copy(self);
9368    if (u == NULL)
9369        return NULL;
9370    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9371
9372    /* fix functions return the new maximum character in a string,
9373       if the kind of the resulting unicode object does not change,
9374       everything is fine.  Otherwise we need to change the string kind
9375       and re-run the fix function. */
9376    maxchar_new = fixfct(u);
9377
9378    if (maxchar_new == 0) {
9379        /* no changes */;
9380        if (PyUnicode_CheckExact(self)) {
9381            Py_DECREF(u);
9382            Py_INCREF(self);
9383            return self;
9384        }
9385        else
9386            return u;
9387    }
9388
9389    maxchar_new = align_maxchar(maxchar_new);
9390
9391    if (maxchar_new == maxchar_old)
9392        return u;
9393
9394    /* In case the maximum character changed, we need to
9395       convert the string to the new category. */
9396    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9397    if (v == NULL) {
9398        Py_DECREF(u);
9399        return NULL;
9400    }
9401    if (maxchar_new > maxchar_old) {
9402        /* If the maxchar increased so that the kind changed, not all
9403           characters are representable anymore and we need to fix the
9404           string again. This only happens in very few cases. */
9405        _PyUnicode_FastCopyCharacters(v, 0,
9406                                      self, 0, PyUnicode_GET_LENGTH(self));
9407        maxchar_old = fixfct(v);
9408        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9409    }
9410    else {
9411        _PyUnicode_FastCopyCharacters(v, 0,
9412                                      u, 0, PyUnicode_GET_LENGTH(self));
9413    }
9414    Py_DECREF(u);
9415    assert(_PyUnicode_CheckConsistency(v, 1));
9416    return v;
9417}
9418
9419static PyObject *
9420ascii_upper_or_lower(PyObject *self, int lower)
9421{
9422    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9423    char *resdata, *data = PyUnicode_DATA(self);
9424    PyObject *res;
9425
9426    res = PyUnicode_New(len, 127);
9427    if (res == NULL)
9428        return NULL;
9429    resdata = PyUnicode_DATA(res);
9430    if (lower)
9431        _Py_bytes_lower(resdata, data, len);
9432    else
9433        _Py_bytes_upper(resdata, data, len);
9434    return res;
9435}
9436
9437static Py_UCS4
9438handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9439{
9440    Py_ssize_t j;
9441    int final_sigma;
9442    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9443    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9444
9445     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9446
9447    where ! is a negation and \p{xxx} is a character with property xxx.
9448    */
9449    for (j = i - 1; j >= 0; j--) {
9450        c = PyUnicode_READ(kind, data, j);
9451        if (!_PyUnicode_IsCaseIgnorable(c))
9452            break;
9453    }
9454    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9455    if (final_sigma) {
9456        for (j = i + 1; j < length; j++) {
9457            c = PyUnicode_READ(kind, data, j);
9458            if (!_PyUnicode_IsCaseIgnorable(c))
9459                break;
9460        }
9461        final_sigma = j == length || !_PyUnicode_IsCased(c);
9462    }
9463    return (final_sigma) ? 0x3C2 : 0x3C3;
9464}
9465
9466static int
9467lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9468           Py_UCS4 c, Py_UCS4 *mapped)
9469{
9470    /* Obscure special case. */
9471    if (c == 0x3A3) {
9472        mapped[0] = handle_capital_sigma(kind, data, length, i);
9473        return 1;
9474    }
9475    return _PyUnicode_ToLowerFull(c, mapped);
9476}
9477
9478static Py_ssize_t
9479do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9480{
9481    Py_ssize_t i, k = 0;
9482    int n_res, j;
9483    Py_UCS4 c, mapped[3];
9484
9485    c = PyUnicode_READ(kind, data, 0);
9486    n_res = _PyUnicode_ToUpperFull(c, mapped);
9487    for (j = 0; j < n_res; j++) {
9488        *maxchar = Py_MAX(*maxchar, mapped[j]);
9489        res[k++] = mapped[j];
9490    }
9491    for (i = 1; i < length; i++) {
9492        c = PyUnicode_READ(kind, data, i);
9493        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9494        for (j = 0; j < n_res; j++) {
9495            *maxchar = Py_MAX(*maxchar, mapped[j]);
9496            res[k++] = mapped[j];
9497        }
9498    }
9499    return k;
9500}
9501
9502static Py_ssize_t
9503do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9504    Py_ssize_t i, k = 0;
9505
9506    for (i = 0; i < length; i++) {
9507        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9508        int n_res, j;
9509        if (Py_UNICODE_ISUPPER(c)) {
9510            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9511        }
9512        else if (Py_UNICODE_ISLOWER(c)) {
9513            n_res = _PyUnicode_ToUpperFull(c, mapped);
9514        }
9515        else {
9516            n_res = 1;
9517            mapped[0] = c;
9518        }
9519        for (j = 0; j < n_res; j++) {
9520            *maxchar = Py_MAX(*maxchar, mapped[j]);
9521            res[k++] = mapped[j];
9522        }
9523    }
9524    return k;
9525}
9526
9527static Py_ssize_t
9528do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9529                  Py_UCS4 *maxchar, int lower)
9530{
9531    Py_ssize_t i, k = 0;
9532
9533    for (i = 0; i < length; i++) {
9534        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9535        int n_res, j;
9536        if (lower)
9537            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9538        else
9539            n_res = _PyUnicode_ToUpperFull(c, mapped);
9540        for (j = 0; j < n_res; j++) {
9541            *maxchar = Py_MAX(*maxchar, mapped[j]);
9542            res[k++] = mapped[j];
9543        }
9544    }
9545    return k;
9546}
9547
9548static Py_ssize_t
9549do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9550{
9551    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9552}
9553
9554static Py_ssize_t
9555do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9556{
9557    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9558}
9559
9560static Py_ssize_t
9561do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9562{
9563    Py_ssize_t i, k = 0;
9564
9565    for (i = 0; i < length; i++) {
9566        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9567        Py_UCS4 mapped[3];
9568        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9569        for (j = 0; j < n_res; j++) {
9570            *maxchar = Py_MAX(*maxchar, mapped[j]);
9571            res[k++] = mapped[j];
9572        }
9573    }
9574    return k;
9575}
9576
9577static Py_ssize_t
9578do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9579{
9580    Py_ssize_t i, k = 0;
9581    int previous_is_cased;
9582
9583    previous_is_cased = 0;
9584    for (i = 0; i < length; i++) {
9585        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9586        Py_UCS4 mapped[3];
9587        int n_res, j;
9588
9589        if (previous_is_cased)
9590            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9591        else
9592            n_res = _PyUnicode_ToTitleFull(c, mapped);
9593
9594        for (j = 0; j < n_res; j++) {
9595            *maxchar = Py_MAX(*maxchar, mapped[j]);
9596            res[k++] = mapped[j];
9597        }
9598
9599        previous_is_cased = _PyUnicode_IsCased(c);
9600    }
9601    return k;
9602}
9603
9604static PyObject *
9605case_operation(PyObject *self,
9606               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9607{
9608    PyObject *res = NULL;
9609    Py_ssize_t length, newlength = 0;
9610    int kind, outkind;
9611    void *data, *outdata;
9612    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9613
9614    assert(PyUnicode_IS_READY(self));
9615
9616    kind = PyUnicode_KIND(self);
9617    data = PyUnicode_DATA(self);
9618    length = PyUnicode_GET_LENGTH(self);
9619    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9620        PyErr_SetString(PyExc_OverflowError, "string is too long");
9621        return NULL;
9622    }
9623    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9624    if (tmp == NULL)
9625        return PyErr_NoMemory();
9626    newlength = perform(kind, data, length, tmp, &maxchar);
9627    res = PyUnicode_New(newlength, maxchar);
9628    if (res == NULL)
9629        goto leave;
9630    tmpend = tmp + newlength;
9631    outdata = PyUnicode_DATA(res);
9632    outkind = PyUnicode_KIND(res);
9633    switch (outkind) {
9634    case PyUnicode_1BYTE_KIND:
9635        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9636        break;
9637    case PyUnicode_2BYTE_KIND:
9638        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9639        break;
9640    case PyUnicode_4BYTE_KIND:
9641        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9642        break;
9643    default:
9644        assert(0);
9645        break;
9646    }
9647  leave:
9648    PyMem_FREE(tmp);
9649    return res;
9650}
9651
9652PyObject *
9653PyUnicode_Join(PyObject *separator, PyObject *seq)
9654{
9655    PyObject *sep = NULL;
9656    Py_ssize_t seplen;
9657    PyObject *res = NULL; /* the result */
9658    PyObject *fseq;          /* PySequence_Fast(seq) */
9659    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9660    PyObject **items;
9661    PyObject *item;
9662    Py_ssize_t sz, i, res_offset;
9663    Py_UCS4 maxchar;
9664    Py_UCS4 item_maxchar;
9665    int use_memcpy;
9666    unsigned char *res_data = NULL, *sep_data = NULL;
9667    PyObject *last_obj;
9668    unsigned int kind = 0;
9669
9670    fseq = PySequence_Fast(seq, "can only join an iterable");
9671    if (fseq == NULL) {
9672        return NULL;
9673    }
9674
9675    /* NOTE: the following code can't call back into Python code,
9676     * so we are sure that fseq won't be mutated.
9677     */
9678
9679    seqlen = PySequence_Fast_GET_SIZE(fseq);
9680    /* If empty sequence, return u"". */
9681    if (seqlen == 0) {
9682        Py_DECREF(fseq);
9683        _Py_RETURN_UNICODE_EMPTY();
9684    }
9685
9686    /* If singleton sequence with an exact Unicode, return that. */
9687    last_obj = NULL;
9688    items = PySequence_Fast_ITEMS(fseq);
9689    if (seqlen == 1) {
9690        if (PyUnicode_CheckExact(items[0])) {
9691            res = items[0];
9692            Py_INCREF(res);
9693            Py_DECREF(fseq);
9694            return res;
9695        }
9696        seplen = 0;
9697        maxchar = 0;
9698    }
9699    else {
9700        /* Set up sep and seplen */
9701        if (separator == NULL) {
9702            /* fall back to a blank space separator */
9703            sep = PyUnicode_FromOrdinal(' ');
9704            if (!sep)
9705                goto onError;
9706            seplen = 1;
9707            maxchar = 32;
9708        }
9709        else {
9710            if (!PyUnicode_Check(separator)) {
9711                PyErr_Format(PyExc_TypeError,
9712                             "separator: expected str instance,"
9713                             " %.80s found",
9714                             Py_TYPE(separator)->tp_name);
9715                goto onError;
9716            }
9717            if (PyUnicode_READY(separator))
9718                goto onError;
9719            sep = separator;
9720            seplen = PyUnicode_GET_LENGTH(separator);
9721            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9722            /* inc refcount to keep this code path symmetric with the
9723               above case of a blank separator */
9724            Py_INCREF(sep);
9725        }
9726        last_obj = sep;
9727    }
9728
9729    /* There are at least two things to join, or else we have a subclass
9730     * of str in the sequence.
9731     * Do a pre-pass to figure out the total amount of space we'll
9732     * need (sz), and see whether all argument are strings.
9733     */
9734    sz = 0;
9735#ifdef Py_DEBUG
9736    use_memcpy = 0;
9737#else
9738    use_memcpy = 1;
9739#endif
9740    for (i = 0; i < seqlen; i++) {
9741        const Py_ssize_t old_sz = sz;
9742        item = items[i];
9743        if (!PyUnicode_Check(item)) {
9744            PyErr_Format(PyExc_TypeError,
9745                         "sequence item %zd: expected str instance,"
9746                         " %.80s found",
9747                         i, Py_TYPE(item)->tp_name);
9748            goto onError;
9749        }
9750        if (PyUnicode_READY(item) == -1)
9751            goto onError;
9752        sz += PyUnicode_GET_LENGTH(item);
9753        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9754        maxchar = Py_MAX(maxchar, item_maxchar);
9755        if (i != 0)
9756            sz += seplen;
9757        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9758            PyErr_SetString(PyExc_OverflowError,
9759                            "join() result is too long for a Python string");
9760            goto onError;
9761        }
9762        if (use_memcpy && last_obj != NULL) {
9763            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9764                use_memcpy = 0;
9765        }
9766        last_obj = item;
9767    }
9768
9769    res = PyUnicode_New(sz, maxchar);
9770    if (res == NULL)
9771        goto onError;
9772
9773    /* Catenate everything. */
9774#ifdef Py_DEBUG
9775    use_memcpy = 0;
9776#else
9777    if (use_memcpy) {
9778        res_data = PyUnicode_1BYTE_DATA(res);
9779        kind = PyUnicode_KIND(res);
9780        if (seplen != 0)
9781            sep_data = PyUnicode_1BYTE_DATA(sep);
9782    }
9783#endif
9784    if (use_memcpy) {
9785        for (i = 0; i < seqlen; ++i) {
9786            Py_ssize_t itemlen;
9787            item = items[i];
9788
9789            /* Copy item, and maybe the separator. */
9790            if (i && seplen != 0) {
9791                Py_MEMCPY(res_data,
9792                          sep_data,
9793                          kind * seplen);
9794                res_data += kind * seplen;
9795            }
9796
9797            itemlen = PyUnicode_GET_LENGTH(item);
9798            if (itemlen != 0) {
9799                Py_MEMCPY(res_data,
9800                          PyUnicode_DATA(item),
9801                          kind * itemlen);
9802                res_data += kind * itemlen;
9803            }
9804        }
9805        assert(res_data == PyUnicode_1BYTE_DATA(res)
9806                           + kind * PyUnicode_GET_LENGTH(res));
9807    }
9808    else {
9809        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9810            Py_ssize_t itemlen;
9811            item = items[i];
9812
9813            /* Copy item, and maybe the separator. */
9814            if (i && seplen != 0) {
9815                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9816                res_offset += seplen;
9817            }
9818
9819            itemlen = PyUnicode_GET_LENGTH(item);
9820            if (itemlen != 0) {
9821                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9822                res_offset += itemlen;
9823            }
9824        }
9825        assert(res_offset == PyUnicode_GET_LENGTH(res));
9826    }
9827
9828    Py_DECREF(fseq);
9829    Py_XDECREF(sep);
9830    assert(_PyUnicode_CheckConsistency(res, 1));
9831    return res;
9832
9833  onError:
9834    Py_DECREF(fseq);
9835    Py_XDECREF(sep);
9836    Py_XDECREF(res);
9837    return NULL;
9838}
9839
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (1, 2 or 4 byte kind) and must not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that expressions can
   be passed safely.  The arguments may still be evaluated more than once,
   so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9863
9864void
9865_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9866                    Py_UCS4 fill_char)
9867{
9868    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9869    const void *data = PyUnicode_DATA(unicode);
9870    assert(PyUnicode_IS_READY(unicode));
9871    assert(unicode_modifiable(unicode));
9872    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9873    assert(start >= 0);
9874    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9875    FILL(kind, data, fill_char, start, length);
9876}
9877
9878Py_ssize_t
9879PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9880               Py_UCS4 fill_char)
9881{
9882    Py_ssize_t maxlen;
9883
9884    if (!PyUnicode_Check(unicode)) {
9885        PyErr_BadInternalCall();
9886        return -1;
9887    }
9888    if (PyUnicode_READY(unicode) == -1)
9889        return -1;
9890    if (unicode_check_modifiable(unicode))
9891        return -1;
9892
9893    if (start < 0) {
9894        PyErr_SetString(PyExc_IndexError, "string index out of range");
9895        return -1;
9896    }
9897    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9898        PyErr_SetString(PyExc_ValueError,
9899                         "fill character is bigger than "
9900                         "the string maximum character");
9901        return -1;
9902    }
9903
9904    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9905    length = Py_MIN(maxlen, length);
9906    if (length <= 0)
9907        return 0;
9908
9909    _PyUnicode_FastFill(unicode, start, length, fill_char);
9910    return length;
9911}
9912
9913static PyObject *
9914pad(PyObject *self,
9915    Py_ssize_t left,
9916    Py_ssize_t right,
9917    Py_UCS4 fill)
9918{
9919    PyObject *u;
9920    Py_UCS4 maxchar;
9921    int kind;
9922    void *data;
9923
9924    if (left < 0)
9925        left = 0;
9926    if (right < 0)
9927        right = 0;
9928
9929    if (left == 0 && right == 0)
9930        return unicode_result_unchanged(self);
9931
9932    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9933        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9934        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9935        return NULL;
9936    }
9937    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9938    maxchar = Py_MAX(maxchar, fill);
9939    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9940    if (!u)
9941        return NULL;
9942
9943    kind = PyUnicode_KIND(u);
9944    data = PyUnicode_DATA(u);
9945    if (left)
9946        FILL(kind, data, fill, 0, left);
9947    if (right)
9948        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9949    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9950    assert(_PyUnicode_CheckConsistency(u, 1));
9951    return u;
9952}
9953
9954PyObject *
9955PyUnicode_Splitlines(PyObject *string, int keepends)
9956{
9957    PyObject *list;
9958
9959    string = PyUnicode_FromObject(string);
9960    if (string == NULL)
9961        return NULL;
9962    if (PyUnicode_READY(string) == -1) {
9963        Py_DECREF(string);
9964        return NULL;
9965    }
9966
9967    switch (PyUnicode_KIND(string)) {
9968    case PyUnicode_1BYTE_KIND:
9969        if (PyUnicode_IS_ASCII(string))
9970            list = asciilib_splitlines(
9971                string, PyUnicode_1BYTE_DATA(string),
9972                PyUnicode_GET_LENGTH(string), keepends);
9973        else
9974            list = ucs1lib_splitlines(
9975                string, PyUnicode_1BYTE_DATA(string),
9976                PyUnicode_GET_LENGTH(string), keepends);
9977        break;
9978    case PyUnicode_2BYTE_KIND:
9979        list = ucs2lib_splitlines(
9980            string, PyUnicode_2BYTE_DATA(string),
9981            PyUnicode_GET_LENGTH(string), keepends);
9982        break;
9983    case PyUnicode_4BYTE_KIND:
9984        list = ucs4lib_splitlines(
9985            string, PyUnicode_4BYTE_DATA(string),
9986            PyUnicode_GET_LENGTH(string), keepends);
9987        break;
9988    default:
9989        assert(0);
9990        list = 0;
9991    }
9992    Py_DECREF(string);
9993    return list;
9994}
9995
9996static PyObject *
9997split(PyObject *self,
9998      PyObject *substring,
9999      Py_ssize_t maxcount)
10000{
10001    int kind1, kind2;
10002    void *buf1, *buf2;
10003    Py_ssize_t len1, len2;
10004    PyObject* out;
10005
10006    if (maxcount < 0)
10007        maxcount = PY_SSIZE_T_MAX;
10008
10009    if (PyUnicode_READY(self) == -1)
10010        return NULL;
10011
10012    if (substring == NULL)
10013        switch (PyUnicode_KIND(self)) {
10014        case PyUnicode_1BYTE_KIND:
10015            if (PyUnicode_IS_ASCII(self))
10016                return asciilib_split_whitespace(
10017                    self,  PyUnicode_1BYTE_DATA(self),
10018                    PyUnicode_GET_LENGTH(self), maxcount
10019                    );
10020            else
10021                return ucs1lib_split_whitespace(
10022                    self,  PyUnicode_1BYTE_DATA(self),
10023                    PyUnicode_GET_LENGTH(self), maxcount
10024                    );
10025        case PyUnicode_2BYTE_KIND:
10026            return ucs2lib_split_whitespace(
10027                self,  PyUnicode_2BYTE_DATA(self),
10028                PyUnicode_GET_LENGTH(self), maxcount
10029                );
10030        case PyUnicode_4BYTE_KIND:
10031            return ucs4lib_split_whitespace(
10032                self,  PyUnicode_4BYTE_DATA(self),
10033                PyUnicode_GET_LENGTH(self), maxcount
10034                );
10035        default:
10036            assert(0);
10037            return NULL;
10038        }
10039
10040    if (PyUnicode_READY(substring) == -1)
10041        return NULL;
10042
10043    kind1 = PyUnicode_KIND(self);
10044    kind2 = PyUnicode_KIND(substring);
10045    len1 = PyUnicode_GET_LENGTH(self);
10046    len2 = PyUnicode_GET_LENGTH(substring);
10047    if (kind1 < kind2 || len1 < len2) {
10048        out = PyList_New(1);
10049        if (out == NULL)
10050            return NULL;
10051        Py_INCREF(self);
10052        PyList_SET_ITEM(out, 0, self);
10053        return out;
10054    }
10055    buf1 = PyUnicode_DATA(self);
10056    buf2 = PyUnicode_DATA(substring);
10057    if (kind2 != kind1) {
10058        buf2 = _PyUnicode_AsKind(substring, kind1);
10059        if (!buf2)
10060            return NULL;
10061    }
10062
10063    switch (kind1) {
10064    case PyUnicode_1BYTE_KIND:
10065        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10066            out = asciilib_split(
10067                self,  buf1, len1, buf2, len2, maxcount);
10068        else
10069            out = ucs1lib_split(
10070                self,  buf1, len1, buf2, len2, maxcount);
10071        break;
10072    case PyUnicode_2BYTE_KIND:
10073        out = ucs2lib_split(
10074            self,  buf1, len1, buf2, len2, maxcount);
10075        break;
10076    case PyUnicode_4BYTE_KIND:
10077        out = ucs4lib_split(
10078            self,  buf1, len1, buf2, len2, maxcount);
10079        break;
10080    default:
10081        out = NULL;
10082    }
10083    if (kind2 != kind1)
10084        PyMem_Free(buf2);
10085    return out;
10086}
10087
10088static PyObject *
10089rsplit(PyObject *self,
10090       PyObject *substring,
10091       Py_ssize_t maxcount)
10092{
10093    int kind1, kind2;
10094    void *buf1, *buf2;
10095    Py_ssize_t len1, len2;
10096    PyObject* out;
10097
10098    if (maxcount < 0)
10099        maxcount = PY_SSIZE_T_MAX;
10100
10101    if (PyUnicode_READY(self) == -1)
10102        return NULL;
10103
10104    if (substring == NULL)
10105        switch (PyUnicode_KIND(self)) {
10106        case PyUnicode_1BYTE_KIND:
10107            if (PyUnicode_IS_ASCII(self))
10108                return asciilib_rsplit_whitespace(
10109                    self,  PyUnicode_1BYTE_DATA(self),
10110                    PyUnicode_GET_LENGTH(self), maxcount
10111                    );
10112            else
10113                return ucs1lib_rsplit_whitespace(
10114                    self,  PyUnicode_1BYTE_DATA(self),
10115                    PyUnicode_GET_LENGTH(self), maxcount
10116                    );
10117        case PyUnicode_2BYTE_KIND:
10118            return ucs2lib_rsplit_whitespace(
10119                self,  PyUnicode_2BYTE_DATA(self),
10120                PyUnicode_GET_LENGTH(self), maxcount
10121                );
10122        case PyUnicode_4BYTE_KIND:
10123            return ucs4lib_rsplit_whitespace(
10124                self,  PyUnicode_4BYTE_DATA(self),
10125                PyUnicode_GET_LENGTH(self), maxcount
10126                );
10127        default:
10128            assert(0);
10129            return NULL;
10130        }
10131
10132    if (PyUnicode_READY(substring) == -1)
10133        return NULL;
10134
10135    kind1 = PyUnicode_KIND(self);
10136    kind2 = PyUnicode_KIND(substring);
10137    len1 = PyUnicode_GET_LENGTH(self);
10138    len2 = PyUnicode_GET_LENGTH(substring);
10139    if (kind1 < kind2 || len1 < len2) {
10140        out = PyList_New(1);
10141        if (out == NULL)
10142            return NULL;
10143        Py_INCREF(self);
10144        PyList_SET_ITEM(out, 0, self);
10145        return out;
10146    }
10147    buf1 = PyUnicode_DATA(self);
10148    buf2 = PyUnicode_DATA(substring);
10149    if (kind2 != kind1) {
10150        buf2 = _PyUnicode_AsKind(substring, kind1);
10151        if (!buf2)
10152            return NULL;
10153    }
10154
10155    switch (kind1) {
10156    case PyUnicode_1BYTE_KIND:
10157        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10158            out = asciilib_rsplit(
10159                self,  buf1, len1, buf2, len2, maxcount);
10160        else
10161            out = ucs1lib_rsplit(
10162                self,  buf1, len1, buf2, len2, maxcount);
10163        break;
10164    case PyUnicode_2BYTE_KIND:
10165        out = ucs2lib_rsplit(
10166            self,  buf1, len1, buf2, len2, maxcount);
10167        break;
10168    case PyUnicode_4BYTE_KIND:
10169        out = ucs4lib_rsplit(
10170            self,  buf1, len1, buf2, len2, maxcount);
10171        break;
10172    default:
10173        out = NULL;
10174    }
10175    if (kind2 != kind1)
10176        PyMem_Free(buf2);
10177    return out;
10178}
10179
10180static Py_ssize_t
10181anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10182            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10183{
10184    switch (kind) {
10185    case PyUnicode_1BYTE_KIND:
10186        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10187            return asciilib_find(buf1, len1, buf2, len2, offset);
10188        else
10189            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10190    case PyUnicode_2BYTE_KIND:
10191        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10192    case PyUnicode_4BYTE_KIND:
10193        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10194    }
10195    assert(0);
10196    return -1;
10197}
10198
10199static Py_ssize_t
10200anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10201             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10202{
10203    switch (kind) {
10204    case PyUnicode_1BYTE_KIND:
10205        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10206            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10207        else
10208            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10209    case PyUnicode_2BYTE_KIND:
10210        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10211    case PyUnicode_4BYTE_KIND:
10212        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10213    }
10214    assert(0);
10215    return 0;
10216}
10217
10218static void
10219replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10220                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10221{
10222    int kind = PyUnicode_KIND(u);
10223    void *data = PyUnicode_DATA(u);
10224    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10225    if (kind == PyUnicode_1BYTE_KIND) {
10226        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10227                                      (Py_UCS1 *)data + len,
10228                                      u1, u2, maxcount);
10229    }
10230    else if (kind == PyUnicode_2BYTE_KIND) {
10231        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10232                                      (Py_UCS2 *)data + len,
10233                                      u1, u2, maxcount);
10234    }
10235    else {
10236        assert(kind == PyUnicode_4BYTE_KIND);
10237        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10238                                      (Py_UCS4 *)data + len,
10239                                      u1, u2, maxcount);
10240    }
10241}
10242
10243static PyObject *
10244replace(PyObject *self, PyObject *str1,
10245        PyObject *str2, Py_ssize_t maxcount)
10246{
10247    PyObject *u;
10248    char *sbuf = PyUnicode_DATA(self);
10249    char *buf1 = PyUnicode_DATA(str1);
10250    char *buf2 = PyUnicode_DATA(str2);
10251    int srelease = 0, release1 = 0, release2 = 0;
10252    int skind = PyUnicode_KIND(self);
10253    int kind1 = PyUnicode_KIND(str1);
10254    int kind2 = PyUnicode_KIND(str2);
10255    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10256    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10257    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10258    int mayshrink;
10259    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10260
10261    if (maxcount < 0)
10262        maxcount = PY_SSIZE_T_MAX;
10263    else if (maxcount == 0 || slen == 0)
10264        goto nothing;
10265
10266    if (str1 == str2)
10267        goto nothing;
10268
10269    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10270    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10271    if (maxchar < maxchar_str1)
10272        /* substring too wide to be present */
10273        goto nothing;
10274    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10275    /* Replacing str1 with str2 may cause a maxchar reduction in the
10276       result string. */
10277    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10278    maxchar = Py_MAX(maxchar, maxchar_str2);
10279
10280    if (len1 == len2) {
10281        /* same length */
10282        if (len1 == 0)
10283            goto nothing;
10284        if (len1 == 1) {
10285            /* replace characters */
10286            Py_UCS4 u1, u2;
10287            Py_ssize_t pos;
10288
10289            u1 = PyUnicode_READ(kind1, buf1, 0);
10290            pos = findchar(sbuf, skind, slen, u1, 1);
10291            if (pos < 0)
10292                goto nothing;
10293            u2 = PyUnicode_READ(kind2, buf2, 0);
10294            u = PyUnicode_New(slen, maxchar);
10295            if (!u)
10296                goto error;
10297
10298            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10299            replace_1char_inplace(u, pos, u1, u2, maxcount);
10300        }
10301        else {
10302            int rkind = skind;
10303            char *res;
10304            Py_ssize_t i;
10305
10306            if (kind1 < rkind) {
10307                /* widen substring */
10308                buf1 = _PyUnicode_AsKind(str1, rkind);
10309                if (!buf1) goto error;
10310                release1 = 1;
10311            }
10312            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10313            if (i < 0)
10314                goto nothing;
10315            if (rkind > kind2) {
10316                /* widen replacement */
10317                buf2 = _PyUnicode_AsKind(str2, rkind);
10318                if (!buf2) goto error;
10319                release2 = 1;
10320            }
10321            else if (rkind < kind2) {
10322                /* widen self and buf1 */
10323                rkind = kind2;
10324                if (release1) PyMem_Free(buf1);
10325                release1 = 0;
10326                sbuf = _PyUnicode_AsKind(self, rkind);
10327                if (!sbuf) goto error;
10328                srelease = 1;
10329                buf1 = _PyUnicode_AsKind(str1, rkind);
10330                if (!buf1) goto error;
10331                release1 = 1;
10332            }
10333            u = PyUnicode_New(slen, maxchar);
10334            if (!u)
10335                goto error;
10336            assert(PyUnicode_KIND(u) == rkind);
10337            res = PyUnicode_DATA(u);
10338
10339            memcpy(res, sbuf, rkind * slen);
10340            /* change everything in-place, starting with this one */
10341            memcpy(res + rkind * i,
10342                   buf2,
10343                   rkind * len2);
10344            i += len1;
10345
10346            while ( --maxcount > 0) {
10347                i = anylib_find(rkind, self,
10348                                sbuf+rkind*i, slen-i,
10349                                str1, buf1, len1, i);
10350                if (i == -1)
10351                    break;
10352                memcpy(res + rkind * i,
10353                       buf2,
10354                       rkind * len2);
10355                i += len1;
10356            }
10357        }
10358    }
10359    else {
10360        Py_ssize_t n, i, j, ires;
10361        Py_ssize_t new_size;
10362        int rkind = skind;
10363        char *res;
10364
10365        if (kind1 < rkind) {
10366            /* widen substring */
10367            buf1 = _PyUnicode_AsKind(str1, rkind);
10368            if (!buf1) goto error;
10369            release1 = 1;
10370        }
10371        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10372        if (n == 0)
10373            goto nothing;
10374        if (kind2 < rkind) {
10375            /* widen replacement */
10376            buf2 = _PyUnicode_AsKind(str2, rkind);
10377            if (!buf2) goto error;
10378            release2 = 1;
10379        }
10380        else if (kind2 > rkind) {
10381            /* widen self and buf1 */
10382            rkind = kind2;
10383            sbuf = _PyUnicode_AsKind(self, rkind);
10384            if (!sbuf) goto error;
10385            srelease = 1;
10386            if (release1) PyMem_Free(buf1);
10387            release1 = 0;
10388            buf1 = _PyUnicode_AsKind(str1, rkind);
10389            if (!buf1) goto error;
10390            release1 = 1;
10391        }
10392        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10393           PyUnicode_GET_LENGTH(str1))); */
10394        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10395                PyErr_SetString(PyExc_OverflowError,
10396                                "replace string is too long");
10397                goto error;
10398        }
10399        new_size = slen + n * (len2 - len1);
10400        if (new_size == 0) {
10401            _Py_INCREF_UNICODE_EMPTY();
10402            if (!unicode_empty)
10403                goto error;
10404            u = unicode_empty;
10405            goto done;
10406        }
10407        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10408            PyErr_SetString(PyExc_OverflowError,
10409                            "replace string is too long");
10410            goto error;
10411        }
10412        u = PyUnicode_New(new_size, maxchar);
10413        if (!u)
10414            goto error;
10415        assert(PyUnicode_KIND(u) == rkind);
10416        res = PyUnicode_DATA(u);
10417        ires = i = 0;
10418        if (len1 > 0) {
10419            while (n-- > 0) {
10420                /* look for next match */
10421                j = anylib_find(rkind, self,
10422                                sbuf + rkind * i, slen-i,
10423                                str1, buf1, len1, i);
10424                if (j == -1)
10425                    break;
10426                else if (j > i) {
10427                    /* copy unchanged part [i:j] */
10428                    memcpy(res + rkind * ires,
10429                           sbuf + rkind * i,
10430                           rkind * (j-i));
10431                    ires += j - i;
10432                }
10433                /* copy substitution string */
10434                if (len2 > 0) {
10435                    memcpy(res + rkind * ires,
10436                           buf2,
10437                           rkind * len2);
10438                    ires += len2;
10439                }
10440                i = j + len1;
10441            }
10442            if (i < slen)
10443                /* copy tail [i:] */
10444                memcpy(res + rkind * ires,
10445                       sbuf + rkind * i,
10446                       rkind * (slen-i));
10447        }
10448        else {
10449            /* interleave */
10450            while (n > 0) {
10451                memcpy(res + rkind * ires,
10452                       buf2,
10453                       rkind * len2);
10454                ires += len2;
10455                if (--n <= 0)
10456                    break;
10457                memcpy(res + rkind * ires,
10458                       sbuf + rkind * i,
10459                       rkind);
10460                ires++;
10461                i++;
10462            }
10463            memcpy(res + rkind * ires,
10464                   sbuf + rkind * i,
10465                   rkind * (slen-i));
10466        }
10467    }
10468
10469    if (mayshrink) {
10470        unicode_adjust_maxchar(&u);
10471        if (u == NULL)
10472            goto error;
10473    }
10474
10475  done:
10476    if (srelease)
10477        PyMem_FREE(sbuf);
10478    if (release1)
10479        PyMem_FREE(buf1);
10480    if (release2)
10481        PyMem_FREE(buf2);
10482    assert(_PyUnicode_CheckConsistency(u, 1));
10483    return u;
10484
10485  nothing:
10486    /* nothing to replace; return original string (when possible) */
10487    if (srelease)
10488        PyMem_FREE(sbuf);
10489    if (release1)
10490        PyMem_FREE(buf1);
10491    if (release2)
10492        PyMem_FREE(buf2);
10493    return unicode_result_unchanged(self);
10494
10495  error:
10496    if (srelease && sbuf)
10497        PyMem_FREE(sbuf);
10498    if (release1 && buf1)
10499        PyMem_FREE(buf1);
10500    if (release2 && buf2)
10501        PyMem_FREE(buf2);
10502    return NULL;
10503}
10504
10505/* --- Unicode Object Methods --------------------------------------------- */
10506
10507PyDoc_STRVAR(title__doc__,
10508             "S.title() -> str\n\
10509\n\
10510Return a titlecased version of S, i.e. words start with title case\n\
10511characters, all remaining cased characters have lower case.");
10512
10513static PyObject*
10514unicode_title(PyObject *self)
10515{
10516    if (PyUnicode_READY(self) == -1)
10517        return NULL;
10518    return case_operation(self, do_title);
10519}
10520
10521PyDoc_STRVAR(capitalize__doc__,
10522             "S.capitalize() -> str\n\
10523\n\
10524Return a capitalized version of S, i.e. make the first character\n\
10525have upper case and the rest lower case.");
10526
10527static PyObject*
10528unicode_capitalize(PyObject *self)
10529{
10530    if (PyUnicode_READY(self) == -1)
10531        return NULL;
10532    if (PyUnicode_GET_LENGTH(self) == 0)
10533        return unicode_result_unchanged(self);
10534    return case_operation(self, do_capitalize);
10535}
10536
10537PyDoc_STRVAR(casefold__doc__,
10538             "S.casefold() -> str\n\
10539\n\
10540Return a version of S suitable for caseless comparisons.");
10541
10542static PyObject *
10543unicode_casefold(PyObject *self)
10544{
10545    if (PyUnicode_READY(self) == -1)
10546        return NULL;
10547    if (PyUnicode_IS_ASCII(self))
10548        return ascii_upper_or_lower(self, 1);
10549    return case_operation(self, do_casefold);
10550}
10551
10552
10553/* Argument converter.  Coerces to a single unicode character */
10554
10555static int
10556convert_uc(PyObject *obj, void *addr)
10557{
10558    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10559    PyObject *uniobj;
10560
10561    uniobj = PyUnicode_FromObject(obj);
10562    if (uniobj == NULL) {
10563        PyErr_SetString(PyExc_TypeError,
10564                        "The fill character cannot be converted to Unicode");
10565        return 0;
10566    }
10567    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10568        PyErr_SetString(PyExc_TypeError,
10569                        "The fill character must be exactly one character long");
10570        Py_DECREF(uniobj);
10571        return 0;
10572    }
10573    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10574    Py_DECREF(uniobj);
10575    return 1;
10576}
10577
10578PyDoc_STRVAR(center__doc__,
10579             "S.center(width[, fillchar]) -> str\n\
10580\n\
10581Return S centered in a string of length width. Padding is\n\
10582done using the specified fill character (default is a space)");
10583
10584static PyObject *
10585unicode_center(PyObject *self, PyObject *args)
10586{
10587    Py_ssize_t marg, left;
10588    Py_ssize_t width;
10589    Py_UCS4 fillchar = ' ';
10590
10591    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10592        return NULL;
10593
10594    if (PyUnicode_READY(self) == -1)
10595        return NULL;
10596
10597    if (PyUnicode_GET_LENGTH(self) >= width)
10598        return unicode_result_unchanged(self);
10599
10600    marg = width - PyUnicode_GET_LENGTH(self);
10601    left = marg / 2 + (marg & width & 1);
10602
10603    return pad(self, left, marg - left, fillchar);
10604}
10605
10606/* This function assumes that str1 and str2 are readied by the caller. */
10607
10608static int
10609unicode_compare(PyObject *str1, PyObject *str2)
10610{
10611#define COMPARE(TYPE1, TYPE2) \
10612    do { \
10613        TYPE1* p1 = (TYPE1 *)data1; \
10614        TYPE2* p2 = (TYPE2 *)data2; \
10615        TYPE1* end = p1 + len; \
10616        Py_UCS4 c1, c2; \
10617        for (; p1 != end; p1++, p2++) { \
10618            c1 = *p1; \
10619            c2 = *p2; \
10620            if (c1 != c2) \
10621                return (c1 < c2) ? -1 : 1; \
10622        } \
10623    } \
10624    while (0)
10625
10626    int kind1, kind2;
10627    void *data1, *data2;
10628    Py_ssize_t len1, len2, len;
10629
10630    kind1 = PyUnicode_KIND(str1);
10631    kind2 = PyUnicode_KIND(str2);
10632    data1 = PyUnicode_DATA(str1);
10633    data2 = PyUnicode_DATA(str2);
10634    len1 = PyUnicode_GET_LENGTH(str1);
10635    len2 = PyUnicode_GET_LENGTH(str2);
10636    len = Py_MIN(len1, len2);
10637
10638    switch(kind1) {
10639    case PyUnicode_1BYTE_KIND:
10640    {
10641        switch(kind2) {
10642        case PyUnicode_1BYTE_KIND:
10643        {
10644            int cmp = memcmp(data1, data2, len);
10645            /* normalize result of memcmp() into the range [-1; 1] */
10646            if (cmp < 0)
10647                return -1;
10648            if (cmp > 0)
10649                return 1;
10650            break;
10651        }
10652        case PyUnicode_2BYTE_KIND:
10653            COMPARE(Py_UCS1, Py_UCS2);
10654            break;
10655        case PyUnicode_4BYTE_KIND:
10656            COMPARE(Py_UCS1, Py_UCS4);
10657            break;
10658        default:
10659            assert(0);
10660        }
10661        break;
10662    }
10663    case PyUnicode_2BYTE_KIND:
10664    {
10665        switch(kind2) {
10666        case PyUnicode_1BYTE_KIND:
10667            COMPARE(Py_UCS2, Py_UCS1);
10668            break;
10669        case PyUnicode_2BYTE_KIND:
10670        {
10671            COMPARE(Py_UCS2, Py_UCS2);
10672            break;
10673        }
10674        case PyUnicode_4BYTE_KIND:
10675            COMPARE(Py_UCS2, Py_UCS4);
10676            break;
10677        default:
10678            assert(0);
10679        }
10680        break;
10681    }
10682    case PyUnicode_4BYTE_KIND:
10683    {
10684        switch(kind2) {
10685        case PyUnicode_1BYTE_KIND:
10686            COMPARE(Py_UCS4, Py_UCS1);
10687            break;
10688        case PyUnicode_2BYTE_KIND:
10689            COMPARE(Py_UCS4, Py_UCS2);
10690            break;
10691        case PyUnicode_4BYTE_KIND:
10692        {
10693#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10694            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10695            /* normalize result of wmemcmp() into the range [-1; 1] */
10696            if (cmp < 0)
10697                return -1;
10698            if (cmp > 0)
10699                return 1;
10700#else
10701            COMPARE(Py_UCS4, Py_UCS4);
10702#endif
10703            break;
10704        }
10705        default:
10706            assert(0);
10707        }
10708        break;
10709    }
10710    default:
10711        assert(0);
10712    }
10713
10714    if (len1 == len2)
10715        return 0;
10716    if (len1 < len2)
10717        return -1;
10718    else
10719        return 1;
10720
10721#undef COMPARE
10722}
10723
10724Py_LOCAL(int)
10725unicode_compare_eq(PyObject *str1, PyObject *str2)
10726{
10727    int kind;
10728    void *data1, *data2;
10729    Py_ssize_t len;
10730    int cmp;
10731
10732    len = PyUnicode_GET_LENGTH(str1);
10733    if (PyUnicode_GET_LENGTH(str2) != len)
10734        return 0;
10735    kind = PyUnicode_KIND(str1);
10736    if (PyUnicode_KIND(str2) != kind)
10737        return 0;
10738    data1 = PyUnicode_DATA(str1);
10739    data2 = PyUnicode_DATA(str2);
10740
10741    cmp = memcmp(data1, data2, len * kind);
10742    return (cmp == 0);
10743}
10744
10745
10746int
10747PyUnicode_Compare(PyObject *left, PyObject *right)
10748{
10749    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10750        if (PyUnicode_READY(left) == -1 ||
10751            PyUnicode_READY(right) == -1)
10752            return -1;
10753
10754        /* a string is equal to itself */
10755        if (left == right)
10756            return 0;
10757
10758        return unicode_compare(left, right);
10759    }
10760    PyErr_Format(PyExc_TypeError,
10761                 "Can't compare %.100s and %.100s",
10762                 left->ob_type->tp_name,
10763                 right->ob_type->tp_name);
10764    return -1;
10765}
10766
10767int
10768_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10769{
10770    PyObject *right_str = _PyUnicode_FromId(right);   /* borrowed */
10771    if (right_str == NULL)
10772        return -1;
10773    return PyUnicode_Compare(left, right_str);
10774}
10775
10776int
10777PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10778{
10779    Py_ssize_t i;
10780    int kind;
10781    Py_UCS4 chr;
10782
10783    assert(_PyUnicode_CHECK(uni));
10784    if (PyUnicode_READY(uni) == -1)
10785        return -1;
10786    kind = PyUnicode_KIND(uni);
10787    if (kind == PyUnicode_1BYTE_KIND) {
10788        const void *data = PyUnicode_1BYTE_DATA(uni);
10789        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10790        size_t len, len2 = strlen(str);
10791        int cmp;
10792
10793        len = Py_MIN(len1, len2);
10794        cmp = memcmp(data, str, len);
10795        if (cmp != 0) {
10796            if (cmp < 0)
10797                return -1;
10798            else
10799                return 1;
10800        }
10801        if (len1 > len2)
10802            return 1; /* uni is longer */
10803        if (len1 < len2)
10804            return -1; /* str is longer */
10805        return 0;
10806    }
10807    else {
10808        void *data = PyUnicode_DATA(uni);
10809        /* Compare Unicode string and source character set string */
10810        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10811            if (chr != (unsigned char)str[i])
10812                return (chr < (unsigned char)(str[i])) ? -1 : 1;
10813        /* This check keeps Python strings that end in '\0' from comparing equal
10814         to C strings identical up to that point. */
10815        if (PyUnicode_GET_LENGTH(uni) != i || chr)
10816            return 1; /* uni is longer */
10817        if (str[i])
10818            return -1; /* str is longer */
10819        return 0;
10820    }
10821}
10822
10823
10824#define TEST_COND(cond)                         \
10825    ((cond) ? Py_True : Py_False)
10826
10827PyObject *
10828PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10829{
10830    int result;
10831    PyObject *v;
10832
10833    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10834        Py_RETURN_NOTIMPLEMENTED;
10835
10836    if (PyUnicode_READY(left) == -1 ||
10837        PyUnicode_READY(right) == -1)
10838        return NULL;
10839
10840    if (left == right) {
10841        switch (op) {
10842        case Py_EQ:
10843        case Py_LE:
10844        case Py_GE:
10845            /* a string is equal to itself */
10846            v = Py_True;
10847            break;
10848        case Py_NE:
10849        case Py_LT:
10850        case Py_GT:
10851            v = Py_False;
10852            break;
10853        default:
10854            PyErr_BadArgument();
10855            return NULL;
10856        }
10857    }
10858    else if (op == Py_EQ || op == Py_NE) {
10859        result = unicode_compare_eq(left, right);
10860        result ^= (op == Py_NE);
10861        v = TEST_COND(result);
10862    }
10863    else {
10864        result = unicode_compare(left, right);
10865
10866        /* Convert the return value to a Boolean */
10867        switch (op) {
10868        case Py_LE:
10869            v = TEST_COND(result <= 0);
10870            break;
10871        case Py_GE:
10872            v = TEST_COND(result >= 0);
10873            break;
10874        case Py_LT:
10875            v = TEST_COND(result == -1);
10876            break;
10877        case Py_GT:
10878            v = TEST_COND(result == 1);
10879            break;
10880        default:
10881            PyErr_BadArgument();
10882            return NULL;
10883        }
10884    }
10885    Py_INCREF(v);
10886    return v;
10887}
10888
10889int
10890PyUnicode_Contains(PyObject *container, PyObject *element)
10891{
10892    PyObject *str, *sub;
10893    int kind1, kind2;
10894    void *buf1, *buf2;
10895    Py_ssize_t len1, len2;
10896    int result;
10897
10898    /* Coerce the two arguments */
10899    sub = PyUnicode_FromObject(element);
10900    if (!sub) {
10901        PyErr_Format(PyExc_TypeError,
10902                     "'in <string>' requires string as left operand, not %s",
10903                     element->ob_type->tp_name);
10904        return -1;
10905    }
10906
10907    str = PyUnicode_FromObject(container);
10908    if (!str) {
10909        Py_DECREF(sub);
10910        return -1;
10911    }
10912
10913    kind1 = PyUnicode_KIND(str);
10914    kind2 = PyUnicode_KIND(sub);
10915    if (kind1 < kind2) {
10916        Py_DECREF(sub);
10917        Py_DECREF(str);
10918        return 0;
10919    }
10920    len1 = PyUnicode_GET_LENGTH(str);
10921    len2 = PyUnicode_GET_LENGTH(sub);
10922    if (len1 < len2) {
10923        Py_DECREF(sub);
10924        Py_DECREF(str);
10925        return 0;
10926    }
10927    buf1 = PyUnicode_DATA(str);
10928    buf2 = PyUnicode_DATA(sub);
10929    if (len2 == 1) {
10930        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10931        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10932        Py_DECREF(sub);
10933        Py_DECREF(str);
10934        return result;
10935    }
10936    if (kind2 != kind1) {
10937        buf2 = _PyUnicode_AsKind(sub, kind1);
10938        if (!buf2) {
10939            Py_DECREF(sub);
10940            Py_DECREF(str);
10941            return -1;
10942        }
10943    }
10944
10945    switch (kind1) {
10946    case PyUnicode_1BYTE_KIND:
10947        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10948        break;
10949    case PyUnicode_2BYTE_KIND:
10950        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10951        break;
10952    case PyUnicode_4BYTE_KIND:
10953        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10954        break;
10955    default:
10956        result = -1;
10957        assert(0);
10958    }
10959
10960    Py_DECREF(str);
10961    Py_DECREF(sub);
10962
10963    if (kind2 != kind1)
10964        PyMem_Free(buf2);
10965
10966    return result;
10967}
10968
10969/* Concat to string or Unicode object giving a new Unicode object. */
10970
10971PyObject *
10972PyUnicode_Concat(PyObject *left, PyObject *right)
10973{
10974    PyObject *u = NULL, *v = NULL, *w;
10975    Py_UCS4 maxchar, maxchar2;
10976    Py_ssize_t u_len, v_len, new_len;
10977
10978    /* Coerce the two arguments */
10979    u = PyUnicode_FromObject(left);
10980    if (u == NULL)
10981        goto onError;
10982    v = PyUnicode_FromObject(right);
10983    if (v == NULL)
10984        goto onError;
10985
10986    /* Shortcuts */
10987    if (v == unicode_empty) {
10988        Py_DECREF(v);
10989        return u;
10990    }
10991    if (u == unicode_empty) {
10992        Py_DECREF(u);
10993        return v;
10994    }
10995
10996    u_len = PyUnicode_GET_LENGTH(u);
10997    v_len = PyUnicode_GET_LENGTH(v);
10998    if (u_len > PY_SSIZE_T_MAX - v_len) {
10999        PyErr_SetString(PyExc_OverflowError,
11000                        "strings are too large to concat");
11001        goto onError;
11002    }
11003    new_len = u_len + v_len;
11004
11005    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
11006    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11007    maxchar = Py_MAX(maxchar, maxchar2);
11008
11009    /* Concat the two Unicode strings */
11010    w = PyUnicode_New(new_len, maxchar);
11011    if (w == NULL)
11012        goto onError;
11013    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11014    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
11015    Py_DECREF(u);
11016    Py_DECREF(v);
11017    assert(_PyUnicode_CheckConsistency(w, 1));
11018    return w;
11019
11020  onError:
11021    Py_XDECREF(u);
11022    Py_XDECREF(v);
11023    return NULL;
11024}
11025
11026void
11027PyUnicode_Append(PyObject **p_left, PyObject *right)
11028{
11029    PyObject *left, *res;
11030    Py_UCS4 maxchar, maxchar2;
11031    Py_ssize_t left_len, right_len, new_len;
11032
11033    if (p_left == NULL) {
11034        if (!PyErr_Occurred())
11035            PyErr_BadInternalCall();
11036        return;
11037    }
11038    left = *p_left;
11039    if (right == NULL || left == NULL
11040        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11041        if (!PyErr_Occurred())
11042            PyErr_BadInternalCall();
11043        goto error;
11044    }
11045
11046    if (PyUnicode_READY(left) == -1)
11047        goto error;
11048    if (PyUnicode_READY(right) == -1)
11049        goto error;
11050
11051    /* Shortcuts */
11052    if (left == unicode_empty) {
11053        Py_DECREF(left);
11054        Py_INCREF(right);
11055        *p_left = right;
11056        return;
11057    }
11058    if (right == unicode_empty)
11059        return;
11060
11061    left_len = PyUnicode_GET_LENGTH(left);
11062    right_len = PyUnicode_GET_LENGTH(right);
11063    if (left_len > PY_SSIZE_T_MAX - right_len) {
11064        PyErr_SetString(PyExc_OverflowError,
11065                        "strings are too large to concat");
11066        goto error;
11067    }
11068    new_len = left_len + right_len;
11069
11070    if (unicode_modifiable(left)
11071        && PyUnicode_CheckExact(right)
11072        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11073        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11074           to change the structure size, but characters are stored just after
11075           the structure, and so it requires to move all characters which is
11076           not so different than duplicating the string. */
11077        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11078    {
11079        /* append inplace */
11080        if (unicode_resize(p_left, new_len) != 0)
11081            goto error;
11082
11083        /* copy 'right' into the newly allocated area of 'left' */
11084        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11085    }
11086    else {
11087        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11088        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11089        maxchar = Py_MAX(maxchar, maxchar2);
11090
11091        /* Concat the two Unicode strings */
11092        res = PyUnicode_New(new_len, maxchar);
11093        if (res == NULL)
11094            goto error;
11095        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11096        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11097        Py_DECREF(left);
11098        *p_left = res;
11099    }
11100    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11101    return;
11102
11103error:
11104    Py_CLEAR(*p_left);
11105}
11106
11107void
11108PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11109{
11110    PyUnicode_Append(pleft, right);
11111    Py_XDECREF(right);
11112}
11113
11114PyDoc_STRVAR(count__doc__,
11115             "S.count(sub[, start[, end]]) -> int\n\
11116\n\
11117Return the number of non-overlapping occurrences of substring sub in\n\
11118string S[start:end].  Optional arguments start and end are\n\
11119interpreted as in slice notation.");
11120
11121static PyObject *
11122unicode_count(PyObject *self, PyObject *args)
11123{
11124    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11125    Py_ssize_t start = 0;
11126    Py_ssize_t end = PY_SSIZE_T_MAX;
11127    PyObject *result;
11128    int kind1, kind2;
11129    void *buf1, *buf2;
11130    Py_ssize_t len1, len2, iresult;
11131
11132    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11133                                            &start, &end))
11134        return NULL;
11135
11136    kind1 = PyUnicode_KIND(self);
11137    kind2 = PyUnicode_KIND(substring);
11138    if (kind1 < kind2) {
11139        Py_DECREF(substring);
11140        return PyLong_FromLong(0);
11141    }
11142    len1 = PyUnicode_GET_LENGTH(self);
11143    len2 = PyUnicode_GET_LENGTH(substring);
11144    ADJUST_INDICES(start, end, len1);
11145    if (end - start < len2) {
11146        Py_DECREF(substring);
11147        return PyLong_FromLong(0);
11148    }
11149    buf1 = PyUnicode_DATA(self);
11150    buf2 = PyUnicode_DATA(substring);
11151    if (kind2 != kind1) {
11152        buf2 = _PyUnicode_AsKind(substring, kind1);
11153        if (!buf2) {
11154            Py_DECREF(substring);
11155            return NULL;
11156        }
11157    }
11158    switch (kind1) {
11159    case PyUnicode_1BYTE_KIND:
11160        iresult = ucs1lib_count(
11161            ((Py_UCS1*)buf1) + start, end - start,
11162            buf2, len2, PY_SSIZE_T_MAX
11163            );
11164        break;
11165    case PyUnicode_2BYTE_KIND:
11166        iresult = ucs2lib_count(
11167            ((Py_UCS2*)buf1) + start, end - start,
11168            buf2, len2, PY_SSIZE_T_MAX
11169            );
11170        break;
11171    case PyUnicode_4BYTE_KIND:
11172        iresult = ucs4lib_count(
11173            ((Py_UCS4*)buf1) + start, end - start,
11174            buf2, len2, PY_SSIZE_T_MAX
11175            );
11176        break;
11177    default:
11178        assert(0); iresult = 0;
11179    }
11180
11181    result = PyLong_FromSsize_t(iresult);
11182
11183    if (kind2 != kind1)
11184        PyMem_Free(buf2);
11185
11186    Py_DECREF(substring);
11187
11188    return result;
11189}
11190
11191PyDoc_STRVAR(encode__doc__,
11192             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11193\n\
11194Encode S using the codec registered for encoding. Default encoding\n\
11195is 'utf-8'. errors may be given to set a different error\n\
11196handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11197a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11198'xmlcharrefreplace' as well as any other name registered with\n\
11199codecs.register_error that can handle UnicodeEncodeErrors.");
11200
11201static PyObject *
11202unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11203{
11204    static char *kwlist[] = {"encoding", "errors", 0};
11205    char *encoding = NULL;
11206    char *errors = NULL;
11207
11208    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11209                                     kwlist, &encoding, &errors))
11210        return NULL;
11211    return PyUnicode_AsEncodedString(self, encoding, errors);
11212}
11213
11214PyDoc_STRVAR(expandtabs__doc__,
11215             "S.expandtabs(tabsize=8) -> str\n\
11216\n\
11217Return a copy of S where all tab characters are expanded using spaces.\n\
11218If tabsize is not given, a tab size of 8 characters is assumed.");
11219
11220static PyObject*
11221unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11222{
11223    Py_ssize_t i, j, line_pos, src_len, incr;
11224    Py_UCS4 ch;
11225    PyObject *u;
11226    void *src_data, *dest_data;
11227    static char *kwlist[] = {"tabsize", 0};
11228    int tabsize = 8;
11229    int kind;
11230    int found;
11231
11232    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11233                                     kwlist, &tabsize))
11234        return NULL;
11235
11236    if (PyUnicode_READY(self) == -1)
11237        return NULL;
11238
11239    /* First pass: determine size of output string */
11240    src_len = PyUnicode_GET_LENGTH(self);
11241    i = j = line_pos = 0;
11242    kind = PyUnicode_KIND(self);
11243    src_data = PyUnicode_DATA(self);
11244    found = 0;
11245    for (; i < src_len; i++) {
11246        ch = PyUnicode_READ(kind, src_data, i);
11247        if (ch == '\t') {
11248            found = 1;
11249            if (tabsize > 0) {
11250                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11251                if (j > PY_SSIZE_T_MAX - incr)
11252                    goto overflow;
11253                line_pos += incr;
11254                j += incr;
11255            }
11256        }
11257        else {
11258            if (j > PY_SSIZE_T_MAX - 1)
11259                goto overflow;
11260            line_pos++;
11261            j++;
11262            if (ch == '\n' || ch == '\r')
11263                line_pos = 0;
11264        }
11265    }
11266    if (!found)
11267        return unicode_result_unchanged(self);
11268
11269    /* Second pass: create output string and fill it */
11270    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11271    if (!u)
11272        return NULL;
11273    dest_data = PyUnicode_DATA(u);
11274
11275    i = j = line_pos = 0;
11276
11277    for (; i < src_len; i++) {
11278        ch = PyUnicode_READ(kind, src_data, i);
11279        if (ch == '\t') {
11280            if (tabsize > 0) {
11281                incr = tabsize - (line_pos % tabsize);
11282                line_pos += incr;
11283                FILL(kind, dest_data, ' ', j, incr);
11284                j += incr;
11285            }
11286        }
11287        else {
11288            line_pos++;
11289            PyUnicode_WRITE(kind, dest_data, j, ch);
11290            j++;
11291            if (ch == '\n' || ch == '\r')
11292                line_pos = 0;
11293        }
11294    }
11295    assert (j == PyUnicode_GET_LENGTH(u));
11296    return unicode_result(u);
11297
11298  overflow:
11299    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11300    return NULL;
11301}
11302
11303PyDoc_STRVAR(find__doc__,
11304             "S.find(sub[, start[, end]]) -> int\n\
11305\n\
11306Return the lowest index in S where substring sub is found,\n\
11307such that sub is contained within S[start:end].  Optional\n\
11308arguments start and end are interpreted as in slice notation.\n\
11309\n\
11310Return -1 on failure.");
11311
11312static PyObject *
11313unicode_find(PyObject *self, PyObject *args)
11314{
11315    /* initialize variables to prevent gcc warning */
11316    PyObject *substring = NULL;
11317    Py_ssize_t start = 0;
11318    Py_ssize_t end = 0;
11319    Py_ssize_t result;
11320
11321    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11322                                            &start, &end))
11323        return NULL;
11324
11325    if (PyUnicode_READY(self) == -1) {
11326        Py_DECREF(substring);
11327        return NULL;
11328    }
11329    if (PyUnicode_READY(substring) == -1) {
11330        Py_DECREF(substring);
11331        return NULL;
11332    }
11333
11334    result = any_find_slice(1, self, substring, start, end);
11335
11336    Py_DECREF(substring);
11337
11338    if (result == -2)
11339        return NULL;
11340
11341    return PyLong_FromSsize_t(result);
11342}
11343
11344static PyObject *
11345unicode_getitem(PyObject *self, Py_ssize_t index)
11346{
11347    void *data;
11348    enum PyUnicode_Kind kind;
11349    Py_UCS4 ch;
11350
11351    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11352        PyErr_BadArgument();
11353        return NULL;
11354    }
11355    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11356        PyErr_SetString(PyExc_IndexError, "string index out of range");
11357        return NULL;
11358    }
11359    kind = PyUnicode_KIND(self);
11360    data = PyUnicode_DATA(self);
11361    ch = PyUnicode_READ(kind, data, index);
11362    return unicode_char(ch);
11363}
11364
11365/* Believe it or not, this produces the same value for ASCII strings
11366   as bytes_hash(). */
11367static Py_hash_t
11368unicode_hash(PyObject *self)
11369{
11370    Py_ssize_t len;
11371    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11372
11373#ifdef Py_DEBUG
11374    assert(_Py_HashSecret_Initialized);
11375#endif
11376    if (_PyUnicode_HASH(self) != -1)
11377        return _PyUnicode_HASH(self);
11378    if (PyUnicode_READY(self) == -1)
11379        return -1;
11380    len = PyUnicode_GET_LENGTH(self);
11381    /*
11382      We make the hash of the empty string be 0, rather than using
11383      (prefix ^ suffix), since this slightly obfuscates the hash secret
11384    */
11385    if (len == 0) {
11386        _PyUnicode_HASH(self) = 0;
11387        return 0;
11388    }
11389    x = _Py_HashBytes(PyUnicode_DATA(self),
11390                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11391    _PyUnicode_HASH(self) = x;
11392    return x;
11393}
11394
11395PyDoc_STRVAR(index__doc__,
11396             "S.index(sub[, start[, end]]) -> int\n\
11397\n\
11398Like S.find() but raise ValueError when the substring is not found.");
11399
11400static PyObject *
11401unicode_index(PyObject *self, PyObject *args)
11402{
11403    /* initialize variables to prevent gcc warning */
11404    Py_ssize_t result;
11405    PyObject *substring = NULL;
11406    Py_ssize_t start = 0;
11407    Py_ssize_t end = 0;
11408
11409    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11410                                            &start, &end))
11411        return NULL;
11412
11413    if (PyUnicode_READY(self) == -1) {
11414        Py_DECREF(substring);
11415        return NULL;
11416    }
11417    if (PyUnicode_READY(substring) == -1) {
11418        Py_DECREF(substring);
11419        return NULL;
11420    }
11421
11422    result = any_find_slice(1, self, substring, start, end);
11423
11424    Py_DECREF(substring);
11425
11426    if (result == -2)
11427        return NULL;
11428
11429    if (result < 0) {
11430        PyErr_SetString(PyExc_ValueError, "substring not found");
11431        return NULL;
11432    }
11433
11434    return PyLong_FromSsize_t(result);
11435}
11436
11437PyDoc_STRVAR(islower__doc__,
11438             "S.islower() -> bool\n\
11439\n\
11440Return True if all cased characters in S are lowercase and there is\n\
11441at least one cased character in S, False otherwise.");
11442
11443static PyObject*
11444unicode_islower(PyObject *self)
11445{
11446    Py_ssize_t i, length;
11447    int kind;
11448    void *data;
11449    int cased;
11450
11451    if (PyUnicode_READY(self) == -1)
11452        return NULL;
11453    length = PyUnicode_GET_LENGTH(self);
11454    kind = PyUnicode_KIND(self);
11455    data = PyUnicode_DATA(self);
11456
11457    /* Shortcut for single character strings */
11458    if (length == 1)
11459        return PyBool_FromLong(
11460            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11461
11462    /* Special case for empty strings */
11463    if (length == 0)
11464        return PyBool_FromLong(0);
11465
11466    cased = 0;
11467    for (i = 0; i < length; i++) {
11468        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11469
11470        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11471            return PyBool_FromLong(0);
11472        else if (!cased && Py_UNICODE_ISLOWER(ch))
11473            cased = 1;
11474    }
11475    return PyBool_FromLong(cased);
11476}
11477
11478PyDoc_STRVAR(isupper__doc__,
11479             "S.isupper() -> bool\n\
11480\n\
11481Return True if all cased characters in S are uppercase and there is\n\
11482at least one cased character in S, False otherwise.");
11483
11484static PyObject*
11485unicode_isupper(PyObject *self)
11486{
11487    Py_ssize_t i, length;
11488    int kind;
11489    void *data;
11490    int cased;
11491
11492    if (PyUnicode_READY(self) == -1)
11493        return NULL;
11494    length = PyUnicode_GET_LENGTH(self);
11495    kind = PyUnicode_KIND(self);
11496    data = PyUnicode_DATA(self);
11497
11498    /* Shortcut for single character strings */
11499    if (length == 1)
11500        return PyBool_FromLong(
11501            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11502
11503    /* Special case for empty strings */
11504    if (length == 0)
11505        return PyBool_FromLong(0);
11506
11507    cased = 0;
11508    for (i = 0; i < length; i++) {
11509        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11510
11511        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11512            return PyBool_FromLong(0);
11513        else if (!cased && Py_UNICODE_ISUPPER(ch))
11514            cased = 1;
11515    }
11516    return PyBool_FromLong(cased);
11517}
11518
11519PyDoc_STRVAR(istitle__doc__,
11520             "S.istitle() -> bool\n\
11521\n\
11522Return True if S is a titlecased string and there is at least one\n\
11523character in S, i.e. upper- and titlecase characters may only\n\
11524follow uncased characters and lowercase characters only cased ones.\n\
11525Return False otherwise.");
11526
11527static PyObject*
11528unicode_istitle(PyObject *self)
11529{
11530    Py_ssize_t i, length;
11531    int kind;
11532    void *data;
11533    int cased, previous_is_cased;
11534
11535    if (PyUnicode_READY(self) == -1)
11536        return NULL;
11537    length = PyUnicode_GET_LENGTH(self);
11538    kind = PyUnicode_KIND(self);
11539    data = PyUnicode_DATA(self);
11540
11541    /* Shortcut for single character strings */
11542    if (length == 1) {
11543        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11544        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11545                               (Py_UNICODE_ISUPPER(ch) != 0));
11546    }
11547
11548    /* Special case for empty strings */
11549    if (length == 0)
11550        return PyBool_FromLong(0);
11551
11552    cased = 0;
11553    previous_is_cased = 0;
11554    for (i = 0; i < length; i++) {
11555        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11556
11557        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11558            if (previous_is_cased)
11559                return PyBool_FromLong(0);
11560            previous_is_cased = 1;
11561            cased = 1;
11562        }
11563        else if (Py_UNICODE_ISLOWER(ch)) {
11564            if (!previous_is_cased)
11565                return PyBool_FromLong(0);
11566            previous_is_cased = 1;
11567            cased = 1;
11568        }
11569        else
11570            previous_is_cased = 0;
11571    }
11572    return PyBool_FromLong(cased);
11573}
11574
11575PyDoc_STRVAR(isspace__doc__,
11576             "S.isspace() -> bool\n\
11577\n\
11578Return True if all characters in S are whitespace\n\
11579and there is at least one character in S, False otherwise.");
11580
11581static PyObject*
11582unicode_isspace(PyObject *self)
11583{
11584    Py_ssize_t i, length;
11585    int kind;
11586    void *data;
11587
11588    if (PyUnicode_READY(self) == -1)
11589        return NULL;
11590    length = PyUnicode_GET_LENGTH(self);
11591    kind = PyUnicode_KIND(self);
11592    data = PyUnicode_DATA(self);
11593
11594    /* Shortcut for single character strings */
11595    if (length == 1)
11596        return PyBool_FromLong(
11597            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11598
11599    /* Special case for empty strings */
11600    if (length == 0)
11601        return PyBool_FromLong(0);
11602
11603    for (i = 0; i < length; i++) {
11604        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11605        if (!Py_UNICODE_ISSPACE(ch))
11606            return PyBool_FromLong(0);
11607    }
11608    return PyBool_FromLong(1);
11609}
11610
11611PyDoc_STRVAR(isalpha__doc__,
11612             "S.isalpha() -> bool\n\
11613\n\
11614Return True if all characters in S are alphabetic\n\
11615and there is at least one character in S, False otherwise.");
11616
11617static PyObject*
11618unicode_isalpha(PyObject *self)
11619{
11620    Py_ssize_t i, length;
11621    int kind;
11622    void *data;
11623
11624    if (PyUnicode_READY(self) == -1)
11625        return NULL;
11626    length = PyUnicode_GET_LENGTH(self);
11627    kind = PyUnicode_KIND(self);
11628    data = PyUnicode_DATA(self);
11629
11630    /* Shortcut for single character strings */
11631    if (length == 1)
11632        return PyBool_FromLong(
11633            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11634
11635    /* Special case for empty strings */
11636    if (length == 0)
11637        return PyBool_FromLong(0);
11638
11639    for (i = 0; i < length; i++) {
11640        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11641            return PyBool_FromLong(0);
11642    }
11643    return PyBool_FromLong(1);
11644}
11645
11646PyDoc_STRVAR(isalnum__doc__,
11647             "S.isalnum() -> bool\n\
11648\n\
11649Return True if all characters in S are alphanumeric\n\
11650and there is at least one character in S, False otherwise.");
11651
11652static PyObject*
11653unicode_isalnum(PyObject *self)
11654{
11655    int kind;
11656    void *data;
11657    Py_ssize_t len, i;
11658
11659    if (PyUnicode_READY(self) == -1)
11660        return NULL;
11661
11662    kind = PyUnicode_KIND(self);
11663    data = PyUnicode_DATA(self);
11664    len = PyUnicode_GET_LENGTH(self);
11665
11666    /* Shortcut for single character strings */
11667    if (len == 1) {
11668        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11669        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11670    }
11671
11672    /* Special case for empty strings */
11673    if (len == 0)
11674        return PyBool_FromLong(0);
11675
11676    for (i = 0; i < len; i++) {
11677        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11678        if (!Py_UNICODE_ISALNUM(ch))
11679            return PyBool_FromLong(0);
11680    }
11681    return PyBool_FromLong(1);
11682}
11683
11684PyDoc_STRVAR(isdecimal__doc__,
11685             "S.isdecimal() -> bool\n\
11686\n\
11687Return True if there are only decimal characters in S,\n\
11688False otherwise.");
11689
11690static PyObject*
11691unicode_isdecimal(PyObject *self)
11692{
11693    Py_ssize_t i, length;
11694    int kind;
11695    void *data;
11696
11697    if (PyUnicode_READY(self) == -1)
11698        return NULL;
11699    length = PyUnicode_GET_LENGTH(self);
11700    kind = PyUnicode_KIND(self);
11701    data = PyUnicode_DATA(self);
11702
11703    /* Shortcut for single character strings */
11704    if (length == 1)
11705        return PyBool_FromLong(
11706            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11707
11708    /* Special case for empty strings */
11709    if (length == 0)
11710        return PyBool_FromLong(0);
11711
11712    for (i = 0; i < length; i++) {
11713        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11714            return PyBool_FromLong(0);
11715    }
11716    return PyBool_FromLong(1);
11717}
11718
11719PyDoc_STRVAR(isdigit__doc__,
11720             "S.isdigit() -> bool\n\
11721\n\
11722Return True if all characters in S are digits\n\
11723and there is at least one character in S, False otherwise.");
11724
11725static PyObject*
11726unicode_isdigit(PyObject *self)
11727{
11728    Py_ssize_t i, length;
11729    int kind;
11730    void *data;
11731
11732    if (PyUnicode_READY(self) == -1)
11733        return NULL;
11734    length = PyUnicode_GET_LENGTH(self);
11735    kind = PyUnicode_KIND(self);
11736    data = PyUnicode_DATA(self);
11737
11738    /* Shortcut for single character strings */
11739    if (length == 1) {
11740        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11741        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11742    }
11743
11744    /* Special case for empty strings */
11745    if (length == 0)
11746        return PyBool_FromLong(0);
11747
11748    for (i = 0; i < length; i++) {
11749        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11750            return PyBool_FromLong(0);
11751    }
11752    return PyBool_FromLong(1);
11753}
11754
11755PyDoc_STRVAR(isnumeric__doc__,
11756             "S.isnumeric() -> bool\n\
11757\n\
11758Return True if there are only numeric characters in S,\n\
11759False otherwise.");
11760
11761static PyObject*
11762unicode_isnumeric(PyObject *self)
11763{
11764    Py_ssize_t i, length;
11765    int kind;
11766    void *data;
11767
11768    if (PyUnicode_READY(self) == -1)
11769        return NULL;
11770    length = PyUnicode_GET_LENGTH(self);
11771    kind = PyUnicode_KIND(self);
11772    data = PyUnicode_DATA(self);
11773
11774    /* Shortcut for single character strings */
11775    if (length == 1)
11776        return PyBool_FromLong(
11777            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11778
11779    /* Special case for empty strings */
11780    if (length == 0)
11781        return PyBool_FromLong(0);
11782
11783    for (i = 0; i < length; i++) {
11784        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11785            return PyBool_FromLong(0);
11786    }
11787    return PyBool_FromLong(1);
11788}
11789
11790int
11791PyUnicode_IsIdentifier(PyObject *self)
11792{
11793    int kind;
11794    void *data;
11795    Py_ssize_t i;
11796    Py_UCS4 first;
11797
11798    if (PyUnicode_READY(self) == -1) {
11799        Py_FatalError("identifier not ready");
11800        return 0;
11801    }
11802
11803    /* Special case for empty strings */
11804    if (PyUnicode_GET_LENGTH(self) == 0)
11805        return 0;
11806    kind = PyUnicode_KIND(self);
11807    data = PyUnicode_DATA(self);
11808
11809    /* PEP 3131 says that the first character must be in
11810       XID_Start and subsequent characters in XID_Continue,
11811       and for the ASCII range, the 2.x rules apply (i.e
11812       start with letters and underscore, continue with
11813       letters, digits, underscore). However, given the current
11814       definition of XID_Start and XID_Continue, it is sufficient
11815       to check just for these, except that _ must be allowed
11816       as starting an identifier.  */
11817    first = PyUnicode_READ(kind, data, 0);
11818    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11819        return 0;
11820
11821    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11822        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11823            return 0;
11824    return 1;
11825}
11826
11827PyDoc_STRVAR(isidentifier__doc__,
11828             "S.isidentifier() -> bool\n\
11829\n\
11830Return True if S is a valid identifier according\n\
11831to the language definition.\n\
11832\n\
11833Use keyword.iskeyword() to test for reserved identifiers\n\
11834such as \"def\" and \"class\".\n");
11835
11836static PyObject*
11837unicode_isidentifier(PyObject *self)
11838{
11839    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11840}
11841
11842PyDoc_STRVAR(isprintable__doc__,
11843             "S.isprintable() -> bool\n\
11844\n\
11845Return True if all characters in S are considered\n\
11846printable in repr() or S is empty, False otherwise.");
11847
11848static PyObject*
11849unicode_isprintable(PyObject *self)
11850{
11851    Py_ssize_t i, length;
11852    int kind;
11853    void *data;
11854
11855    if (PyUnicode_READY(self) == -1)
11856        return NULL;
11857    length = PyUnicode_GET_LENGTH(self);
11858    kind = PyUnicode_KIND(self);
11859    data = PyUnicode_DATA(self);
11860
11861    /* Shortcut for single character strings */
11862    if (length == 1)
11863        return PyBool_FromLong(
11864            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11865
11866    for (i = 0; i < length; i++) {
11867        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11868            Py_RETURN_FALSE;
11869        }
11870    }
11871    Py_RETURN_TRUE;
11872}
11873
11874PyDoc_STRVAR(join__doc__,
11875             "S.join(iterable) -> str\n\
11876\n\
11877Return a string which is the concatenation of the strings in the\n\
11878iterable.  The separator between elements is S.");
11879
11880static PyObject*
11881unicode_join(PyObject *self, PyObject *data)
11882{
11883    return PyUnicode_Join(self, data);
11884}
11885
11886static Py_ssize_t
11887unicode_length(PyObject *self)
11888{
11889    if (PyUnicode_READY(self) == -1)
11890        return -1;
11891    return PyUnicode_GET_LENGTH(self);
11892}
11893
11894PyDoc_STRVAR(ljust__doc__,
11895             "S.ljust(width[, fillchar]) -> str\n\
11896\n\
11897Return S left-justified in a Unicode string of length width. Padding is\n\
11898done using the specified fill character (default is a space).");
11899
11900static PyObject *
11901unicode_ljust(PyObject *self, PyObject *args)
11902{
11903    Py_ssize_t width;
11904    Py_UCS4 fillchar = ' ';
11905
11906    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11907        return NULL;
11908
11909    if (PyUnicode_READY(self) == -1)
11910        return NULL;
11911
11912    if (PyUnicode_GET_LENGTH(self) >= width)
11913        return unicode_result_unchanged(self);
11914
11915    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11916}
11917
11918PyDoc_STRVAR(lower__doc__,
11919             "S.lower() -> str\n\
11920\n\
11921Return a copy of the string S converted to lowercase.");
11922
11923static PyObject*
11924unicode_lower(PyObject *self)
11925{
11926    if (PyUnicode_READY(self) == -1)
11927        return NULL;
11928    if (PyUnicode_IS_ASCII(self))
11929        return ascii_upper_or_lower(self, 1);
11930    return case_operation(self, do_lower);
11931}
11932
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its three
   leading characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i] + 3)
11941
11942/* externally visible for str.strip(unicode) */
11943PyObject *
11944_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11945{
11946    void *data;
11947    int kind;
11948    Py_ssize_t i, j, len;
11949    BLOOM_MASK sepmask;
11950    Py_ssize_t seplen;
11951
11952    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11953        return NULL;
11954
11955    kind = PyUnicode_KIND(self);
11956    data = PyUnicode_DATA(self);
11957    len = PyUnicode_GET_LENGTH(self);
11958    seplen = PyUnicode_GET_LENGTH(sepobj);
11959    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11960                              PyUnicode_DATA(sepobj),
11961                              seplen);
11962
11963    i = 0;
11964    if (striptype != RIGHTSTRIP) {
11965        while (i < len) {
11966            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11967            if (!BLOOM(sepmask, ch))
11968                break;
11969            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11970                break;
11971            i++;
11972        }
11973    }
11974
11975    j = len;
11976    if (striptype != LEFTSTRIP) {
11977        j--;
11978        while (j >= i) {
11979            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11980            if (!BLOOM(sepmask, ch))
11981                break;
11982            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11983                break;
11984            j--;
11985        }
11986
11987        j++;
11988    }
11989
11990    return PyUnicode_Substring(self, i, j);
11991}
11992
11993PyObject*
11994PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11995{
11996    unsigned char *data;
11997    int kind;
11998    Py_ssize_t length;
11999
12000    if (PyUnicode_READY(self) == -1)
12001        return NULL;
12002
12003    length = PyUnicode_GET_LENGTH(self);
12004    end = Py_MIN(end, length);
12005
12006    if (start == 0 && end == length)
12007        return unicode_result_unchanged(self);
12008
12009    if (start < 0 || end < 0) {
12010        PyErr_SetString(PyExc_IndexError, "string index out of range");
12011        return NULL;
12012    }
12013    if (start >= length || end < start)
12014        _Py_RETURN_UNICODE_EMPTY();
12015
12016    length = end - start;
12017    if (PyUnicode_IS_ASCII(self)) {
12018        data = PyUnicode_1BYTE_DATA(self);
12019        return _PyUnicode_FromASCII((char*)(data + start), length);
12020    }
12021    else {
12022        kind = PyUnicode_KIND(self);
12023        data = PyUnicode_1BYTE_DATA(self);
12024        return PyUnicode_FromKindAndData(kind,
12025                                         data + kind * start,
12026                                         length);
12027    }
12028}
12029
12030static PyObject *
12031do_strip(PyObject *self, int striptype)
12032{
12033    Py_ssize_t len, i, j;
12034
12035    if (PyUnicode_READY(self) == -1)
12036        return NULL;
12037
12038    len = PyUnicode_GET_LENGTH(self);
12039
12040    if (PyUnicode_IS_ASCII(self)) {
12041        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12042
12043        i = 0;
12044        if (striptype != RIGHTSTRIP) {
12045            while (i < len) {
12046                Py_UCS1 ch = data[i];
12047                if (!_Py_ascii_whitespace[ch])
12048                    break;
12049                i++;
12050            }
12051        }
12052
12053        j = len;
12054        if (striptype != LEFTSTRIP) {
12055            j--;
12056            while (j >= i) {
12057                Py_UCS1 ch = data[j];
12058                if (!_Py_ascii_whitespace[ch])
12059                    break;
12060                j--;
12061            }
12062            j++;
12063        }
12064    }
12065    else {
12066        int kind = PyUnicode_KIND(self);
12067        void *data = PyUnicode_DATA(self);
12068
12069        i = 0;
12070        if (striptype != RIGHTSTRIP) {
12071            while (i < len) {
12072                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12073                if (!Py_UNICODE_ISSPACE(ch))
12074                    break;
12075                i++;
12076            }
12077        }
12078
12079        j = len;
12080        if (striptype != LEFTSTRIP) {
12081            j--;
12082            while (j >= i) {
12083                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12084                if (!Py_UNICODE_ISSPACE(ch))
12085                    break;
12086                j--;
12087            }
12088            j++;
12089        }
12090    }
12091
12092    return PyUnicode_Substring(self, i, j);
12093}
12094
12095
12096static PyObject *
12097do_argstrip(PyObject *self, int striptype, PyObject *args)
12098{
12099    PyObject *sep = NULL;
12100
12101    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12102        return NULL;
12103
12104    if (sep != NULL && sep != Py_None) {
12105        if (PyUnicode_Check(sep))
12106            return _PyUnicode_XStrip(self, striptype, sep);
12107        else {
12108            PyErr_Format(PyExc_TypeError,
12109                         "%s arg must be None or str",
12110                         STRIPNAME(striptype));
12111            return NULL;
12112        }
12113    }
12114
12115    return do_strip(self, striptype);
12116}
12117
12118
12119PyDoc_STRVAR(strip__doc__,
12120             "S.strip([chars]) -> str\n\
12121\n\
12122Return a copy of the string S with leading and trailing\n\
12123whitespace removed.\n\
12124If chars is given and not None, remove characters in chars instead.");
12125
12126static PyObject *
12127unicode_strip(PyObject *self, PyObject *args)
12128{
12129    if (PyTuple_GET_SIZE(args) == 0)
12130        return do_strip(self, BOTHSTRIP); /* Common case */
12131    else
12132        return do_argstrip(self, BOTHSTRIP, args);
12133}
12134
12135
12136PyDoc_STRVAR(lstrip__doc__,
12137             "S.lstrip([chars]) -> str\n\
12138\n\
12139Return a copy of the string S with leading whitespace removed.\n\
12140If chars is given and not None, remove characters in chars instead.");
12141
12142static PyObject *
12143unicode_lstrip(PyObject *self, PyObject *args)
12144{
12145    if (PyTuple_GET_SIZE(args) == 0)
12146        return do_strip(self, LEFTSTRIP); /* Common case */
12147    else
12148        return do_argstrip(self, LEFTSTRIP, args);
12149}
12150
12151
12152PyDoc_STRVAR(rstrip__doc__,
12153             "S.rstrip([chars]) -> str\n\
12154\n\
12155Return a copy of the string S with trailing whitespace removed.\n\
12156If chars is given and not None, remove characters in chars instead.");
12157
12158static PyObject *
12159unicode_rstrip(PyObject *self, PyObject *args)
12160{
12161    if (PyTuple_GET_SIZE(args) == 0)
12162        return do_strip(self, RIGHTSTRIP); /* Common case */
12163    else
12164        return do_argstrip(self, RIGHTSTRIP, args);
12165}
12166
12167
12168static PyObject*
12169unicode_repeat(PyObject *str, Py_ssize_t len)
12170{
12171    PyObject *u;
12172    Py_ssize_t nchars, n;
12173
12174    if (len < 1)
12175        _Py_RETURN_UNICODE_EMPTY();
12176
12177    /* no repeat, return original string */
12178    if (len == 1)
12179        return unicode_result_unchanged(str);
12180
12181    if (PyUnicode_READY(str) == -1)
12182        return NULL;
12183
12184    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12185        PyErr_SetString(PyExc_OverflowError,
12186                        "repeated string is too long");
12187        return NULL;
12188    }
12189    nchars = len * PyUnicode_GET_LENGTH(str);
12190
12191    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12192    if (!u)
12193        return NULL;
12194    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12195
12196    if (PyUnicode_GET_LENGTH(str) == 1) {
12197        const int kind = PyUnicode_KIND(str);
12198        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12199        if (kind == PyUnicode_1BYTE_KIND) {
12200            void *to = PyUnicode_DATA(u);
12201            memset(to, (unsigned char)fill_char, len);
12202        }
12203        else if (kind == PyUnicode_2BYTE_KIND) {
12204            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12205            for (n = 0; n < len; ++n)
12206                ucs2[n] = fill_char;
12207        } else {
12208            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12209            assert(kind == PyUnicode_4BYTE_KIND);
12210            for (n = 0; n < len; ++n)
12211                ucs4[n] = fill_char;
12212        }
12213    }
12214    else {
12215        /* number of characters copied this far */
12216        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12217        const Py_ssize_t char_size = PyUnicode_KIND(str);
12218        char *to = (char *) PyUnicode_DATA(u);
12219        Py_MEMCPY(to, PyUnicode_DATA(str),
12220                  PyUnicode_GET_LENGTH(str) * char_size);
12221        while (done < nchars) {
12222            n = (done <= nchars-done) ? done : nchars-done;
12223            Py_MEMCPY(to + (done * char_size), to, n * char_size);
12224            done += n;
12225        }
12226    }
12227
12228    assert(_PyUnicode_CheckConsistency(u, 1));
12229    return u;
12230}
12231
12232PyObject *
12233PyUnicode_Replace(PyObject *obj,
12234                  PyObject *subobj,
12235                  PyObject *replobj,
12236                  Py_ssize_t maxcount)
12237{
12238    PyObject *self;
12239    PyObject *str1;
12240    PyObject *str2;
12241    PyObject *result;
12242
12243    self = PyUnicode_FromObject(obj);
12244    if (self == NULL)
12245        return NULL;
12246    str1 = PyUnicode_FromObject(subobj);
12247    if (str1 == NULL) {
12248        Py_DECREF(self);
12249        return NULL;
12250    }
12251    str2 = PyUnicode_FromObject(replobj);
12252    if (str2 == NULL) {
12253        Py_DECREF(self);
12254        Py_DECREF(str1);
12255        return NULL;
12256    }
12257    if (PyUnicode_READY(self) == -1 ||
12258        PyUnicode_READY(str1) == -1 ||
12259        PyUnicode_READY(str2) == -1)
12260        result = NULL;
12261    else
12262        result = replace(self, str1, str2, maxcount);
12263    Py_DECREF(self);
12264    Py_DECREF(str1);
12265    Py_DECREF(str2);
12266    return result;
12267}
12268
12269PyDoc_STRVAR(replace__doc__,
12270             "S.replace(old, new[, count]) -> str\n\
12271\n\
12272Return a copy of S with all occurrences of substring\n\
12273old replaced by new.  If the optional argument count is\n\
12274given, only the first count occurrences are replaced.");
12275
12276static PyObject*
12277unicode_replace(PyObject *self, PyObject *args)
12278{
12279    PyObject *str1;
12280    PyObject *str2;
12281    Py_ssize_t maxcount = -1;
12282    PyObject *result;
12283
12284    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12285        return NULL;
12286    if (PyUnicode_READY(self) == -1)
12287        return NULL;
12288    str1 = PyUnicode_FromObject(str1);
12289    if (str1 == NULL)
12290        return NULL;
12291    str2 = PyUnicode_FromObject(str2);
12292    if (str2 == NULL) {
12293        Py_DECREF(str1);
12294        return NULL;
12295    }
12296    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12297        result = NULL;
12298    else
12299        result = replace(self, str1, str2, maxcount);
12300
12301    Py_DECREF(str1);
12302    Py_DECREF(str2);
12303    return result;
12304}
12305
12306static PyObject *
12307unicode_repr(PyObject *unicode)
12308{
12309    PyObject *repr;
12310    Py_ssize_t isize;
12311    Py_ssize_t osize, squote, dquote, i, o;
12312    Py_UCS4 max, quote;
12313    int ikind, okind, unchanged;
12314    void *idata, *odata;
12315
12316    if (PyUnicode_READY(unicode) == -1)
12317        return NULL;
12318
12319    isize = PyUnicode_GET_LENGTH(unicode);
12320    idata = PyUnicode_DATA(unicode);
12321
12322    /* Compute length of output, quote characters, and
12323       maximum character */
12324    osize = 0;
12325    max = 127;
12326    squote = dquote = 0;
12327    ikind = PyUnicode_KIND(unicode);
12328    for (i = 0; i < isize; i++) {
12329        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12330        Py_ssize_t incr = 1;
12331        switch (ch) {
12332        case '\'': squote++; break;
12333        case '"':  dquote++; break;
12334        case '\\': case '\t': case '\r': case '\n':
12335            incr = 2;
12336            break;
12337        default:
12338            /* Fast-path ASCII */
12339            if (ch < ' ' || ch == 0x7f)
12340                incr = 4; /* \xHH */
12341            else if (ch < 0x7f)
12342                ;
12343            else if (Py_UNICODE_ISPRINTABLE(ch))
12344                max = ch > max ? ch : max;
12345            else if (ch < 0x100)
12346                incr = 4; /* \xHH */
12347            else if (ch < 0x10000)
12348                incr = 6; /* \uHHHH */
12349            else
12350                incr = 10; /* \uHHHHHHHH */
12351        }
12352        if (osize > PY_SSIZE_T_MAX - incr) {
12353            PyErr_SetString(PyExc_OverflowError,
12354                            "string is too long to generate repr");
12355            return NULL;
12356        }
12357        osize += incr;
12358    }
12359
12360    quote = '\'';
12361    unchanged = (osize == isize);
12362    if (squote) {
12363        unchanged = 0;
12364        if (dquote)
12365            /* Both squote and dquote present. Use squote,
12366               and escape them */
12367            osize += squote;
12368        else
12369            quote = '"';
12370    }
12371    osize += 2;   /* quotes */
12372
12373    repr = PyUnicode_New(osize, max);
12374    if (repr == NULL)
12375        return NULL;
12376    okind = PyUnicode_KIND(repr);
12377    odata = PyUnicode_DATA(repr);
12378
12379    PyUnicode_WRITE(okind, odata, 0, quote);
12380    PyUnicode_WRITE(okind, odata, osize-1, quote);
12381    if (unchanged) {
12382        _PyUnicode_FastCopyCharacters(repr, 1,
12383                                      unicode, 0,
12384                                      isize);
12385    }
12386    else {
12387        for (i = 0, o = 1; i < isize; i++) {
12388            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12389
12390            /* Escape quotes and backslashes */
12391            if ((ch == quote) || (ch == '\\')) {
12392                PyUnicode_WRITE(okind, odata, o++, '\\');
12393                PyUnicode_WRITE(okind, odata, o++, ch);
12394                continue;
12395            }
12396
12397            /* Map special whitespace to '\t', \n', '\r' */
12398            if (ch == '\t') {
12399                PyUnicode_WRITE(okind, odata, o++, '\\');
12400                PyUnicode_WRITE(okind, odata, o++, 't');
12401            }
12402            else if (ch == '\n') {
12403                PyUnicode_WRITE(okind, odata, o++, '\\');
12404                PyUnicode_WRITE(okind, odata, o++, 'n');
12405            }
12406            else if (ch == '\r') {
12407                PyUnicode_WRITE(okind, odata, o++, '\\');
12408                PyUnicode_WRITE(okind, odata, o++, 'r');
12409            }
12410
12411            /* Map non-printable US ASCII to '\xhh' */
12412            else if (ch < ' ' || ch == 0x7F) {
12413                PyUnicode_WRITE(okind, odata, o++, '\\');
12414                PyUnicode_WRITE(okind, odata, o++, 'x');
12415                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12416                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12417            }
12418
12419            /* Copy ASCII characters as-is */
12420            else if (ch < 0x7F) {
12421                PyUnicode_WRITE(okind, odata, o++, ch);
12422            }
12423
12424            /* Non-ASCII characters */
12425            else {
12426                /* Map Unicode whitespace and control characters
12427                   (categories Z* and C* except ASCII space)
12428                */
12429                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12430                    PyUnicode_WRITE(okind, odata, o++, '\\');
12431                    /* Map 8-bit characters to '\xhh' */
12432                    if (ch <= 0xff) {
12433                        PyUnicode_WRITE(okind, odata, o++, 'x');
12434                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12435                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12436                    }
12437                    /* Map 16-bit characters to '\uxxxx' */
12438                    else if (ch <= 0xffff) {
12439                        PyUnicode_WRITE(okind, odata, o++, 'u');
12440                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12441                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12442                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12443                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12444                    }
12445                    /* Map 21-bit characters to '\U00xxxxxx' */
12446                    else {
12447                        PyUnicode_WRITE(okind, odata, o++, 'U');
12448                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12449                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12450                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12451                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12452                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12453                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12454                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12455                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12456                    }
12457                }
12458                /* Copy characters as-is */
12459                else {
12460                    PyUnicode_WRITE(okind, odata, o++, ch);
12461                }
12462            }
12463        }
12464    }
12465    /* Closing quote already added at the beginning */
12466    assert(_PyUnicode_CheckConsistency(repr, 1));
12467    return repr;
12468}
12469
12470PyDoc_STRVAR(rfind__doc__,
12471             "S.rfind(sub[, start[, end]]) -> int\n\
12472\n\
12473Return the highest index in S where substring sub is found,\n\
12474such that sub is contained within S[start:end].  Optional\n\
12475arguments start and end are interpreted as in slice notation.\n\
12476\n\
12477Return -1 on failure.");
12478
12479static PyObject *
12480unicode_rfind(PyObject *self, PyObject *args)
12481{
12482    /* initialize variables to prevent gcc warning */
12483    PyObject *substring = NULL;
12484    Py_ssize_t start = 0;
12485    Py_ssize_t end = 0;
12486    Py_ssize_t result;
12487
12488    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12489                                            &start, &end))
12490        return NULL;
12491
12492    if (PyUnicode_READY(self) == -1) {
12493        Py_DECREF(substring);
12494        return NULL;
12495    }
12496    if (PyUnicode_READY(substring) == -1) {
12497        Py_DECREF(substring);
12498        return NULL;
12499    }
12500
12501    result = any_find_slice(-1, self, substring, start, end);
12502
12503    Py_DECREF(substring);
12504
12505    if (result == -2)
12506        return NULL;
12507
12508    return PyLong_FromSsize_t(result);
12509}
12510
12511PyDoc_STRVAR(rindex__doc__,
12512             "S.rindex(sub[, start[, end]]) -> int\n\
12513\n\
12514Like S.rfind() but raise ValueError when the substring is not found.");
12515
12516static PyObject *
12517unicode_rindex(PyObject *self, PyObject *args)
12518{
12519    /* initialize variables to prevent gcc warning */
12520    PyObject *substring = NULL;
12521    Py_ssize_t start = 0;
12522    Py_ssize_t end = 0;
12523    Py_ssize_t result;
12524
12525    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12526                                            &start, &end))
12527        return NULL;
12528
12529    if (PyUnicode_READY(self) == -1) {
12530        Py_DECREF(substring);
12531        return NULL;
12532    }
12533    if (PyUnicode_READY(substring) == -1) {
12534        Py_DECREF(substring);
12535        return NULL;
12536    }
12537
12538    result = any_find_slice(-1, self, substring, start, end);
12539
12540    Py_DECREF(substring);
12541
12542    if (result == -2)
12543        return NULL;
12544
12545    if (result < 0) {
12546        PyErr_SetString(PyExc_ValueError, "substring not found");
12547        return NULL;
12548    }
12549
12550    return PyLong_FromSsize_t(result);
12551}
12552
12553PyDoc_STRVAR(rjust__doc__,
12554             "S.rjust(width[, fillchar]) -> str\n\
12555\n\
12556Return S right-justified in a string of length width. Padding is\n\
12557done using the specified fill character (default is a space).");
12558
12559static PyObject *
12560unicode_rjust(PyObject *self, PyObject *args)
12561{
12562    Py_ssize_t width;
12563    Py_UCS4 fillchar = ' ';
12564
12565    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12566        return NULL;
12567
12568    if (PyUnicode_READY(self) == -1)
12569        return NULL;
12570
12571    if (PyUnicode_GET_LENGTH(self) >= width)
12572        return unicode_result_unchanged(self);
12573
12574    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12575}
12576
12577PyObject *
12578PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12579{
12580    PyObject *result;
12581
12582    s = PyUnicode_FromObject(s);
12583    if (s == NULL)
12584        return NULL;
12585    if (sep != NULL) {
12586        sep = PyUnicode_FromObject(sep);
12587        if (sep == NULL) {
12588            Py_DECREF(s);
12589            return NULL;
12590        }
12591    }
12592
12593    result = split(s, sep, maxsplit);
12594
12595    Py_DECREF(s);
12596    Py_XDECREF(sep);
12597    return result;
12598}
12599
12600PyDoc_STRVAR(split__doc__,
12601             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12602\n\
12603Return a list of the words in S, using sep as the\n\
12604delimiter string.  If maxsplit is given, at most maxsplit\n\
12605splits are done. If sep is not specified or is None, any\n\
12606whitespace string is a separator and empty strings are\n\
12607removed from the result.");
12608
12609static PyObject*
12610unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12611{
12612    static char *kwlist[] = {"sep", "maxsplit", 0};
12613    PyObject *substring = Py_None;
12614    Py_ssize_t maxcount = -1;
12615
12616    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12617                                     kwlist, &substring, &maxcount))
12618        return NULL;
12619
12620    if (substring == Py_None)
12621        return split(self, NULL, maxcount);
12622    else if (PyUnicode_Check(substring))
12623        return split(self, substring, maxcount);
12624    else
12625        return PyUnicode_Split(self, substring, maxcount);
12626}
12627
12628PyObject *
12629PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12630{
12631    PyObject* str_obj;
12632    PyObject* sep_obj;
12633    PyObject* out;
12634    int kind1, kind2;
12635    void *buf1, *buf2;
12636    Py_ssize_t len1, len2;
12637
12638    str_obj = PyUnicode_FromObject(str_in);
12639    if (!str_obj)
12640        return NULL;
12641    sep_obj = PyUnicode_FromObject(sep_in);
12642    if (!sep_obj) {
12643        Py_DECREF(str_obj);
12644        return NULL;
12645    }
12646    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12647        Py_DECREF(sep_obj);
12648        Py_DECREF(str_obj);
12649        return NULL;
12650    }
12651
12652    kind1 = PyUnicode_KIND(str_obj);
12653    kind2 = PyUnicode_KIND(sep_obj);
12654    len1 = PyUnicode_GET_LENGTH(str_obj);
12655    len2 = PyUnicode_GET_LENGTH(sep_obj);
12656    if (kind1 < kind2 || len1 < len2) {
12657        _Py_INCREF_UNICODE_EMPTY();
12658        if (!unicode_empty)
12659            out = NULL;
12660        else {
12661            out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12662            Py_DECREF(unicode_empty);
12663        }
12664        Py_DECREF(sep_obj);
12665        Py_DECREF(str_obj);
12666        return out;
12667    }
12668    buf1 = PyUnicode_DATA(str_obj);
12669    buf2 = PyUnicode_DATA(sep_obj);
12670    if (kind2 != kind1) {
12671        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12672        if (!buf2)
12673            goto onError;
12674    }
12675
12676    switch (kind1) {
12677    case PyUnicode_1BYTE_KIND:
12678        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12679            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12680        else
12681            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12682        break;
12683    case PyUnicode_2BYTE_KIND:
12684        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12685        break;
12686    case PyUnicode_4BYTE_KIND:
12687        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12688        break;
12689    default:
12690        assert(0);
12691        out = 0;
12692    }
12693
12694    Py_DECREF(sep_obj);
12695    Py_DECREF(str_obj);
12696    if (kind2 != kind1)
12697        PyMem_Free(buf2);
12698
12699    return out;
12700  onError:
12701    Py_DECREF(sep_obj);
12702    Py_DECREF(str_obj);
12703    if (kind2 != kind1 && buf2)
12704        PyMem_Free(buf2);
12705    return NULL;
12706}
12707
12708
12709PyObject *
12710PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12711{
12712    PyObject* str_obj;
12713    PyObject* sep_obj;
12714    PyObject* out;
12715    int kind1, kind2;
12716    void *buf1, *buf2;
12717    Py_ssize_t len1, len2;
12718
12719    str_obj = PyUnicode_FromObject(str_in);
12720    if (!str_obj)
12721        return NULL;
12722    sep_obj = PyUnicode_FromObject(sep_in);
12723    if (!sep_obj) {
12724        Py_DECREF(str_obj);
12725        return NULL;
12726    }
12727
12728    kind1 = PyUnicode_KIND(str_obj);
12729    kind2 = PyUnicode_KIND(sep_obj);
12730    len1 = PyUnicode_GET_LENGTH(str_obj);
12731    len2 = PyUnicode_GET_LENGTH(sep_obj);
12732    if (kind1 < kind2 || len1 < len2) {
12733        _Py_INCREF_UNICODE_EMPTY();
12734        if (!unicode_empty)
12735            out = NULL;
12736        else {
12737            out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12738            Py_DECREF(unicode_empty);
12739        }
12740        Py_DECREF(sep_obj);
12741        Py_DECREF(str_obj);
12742        return out;
12743    }
12744    buf1 = PyUnicode_DATA(str_obj);
12745    buf2 = PyUnicode_DATA(sep_obj);
12746    if (kind2 != kind1) {
12747        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12748        if (!buf2)
12749            goto onError;
12750    }
12751
12752    switch (kind1) {
12753    case PyUnicode_1BYTE_KIND:
12754        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12755            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12756        else
12757            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12758        break;
12759    case PyUnicode_2BYTE_KIND:
12760        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12761        break;
12762    case PyUnicode_4BYTE_KIND:
12763        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12764        break;
12765    default:
12766        assert(0);
12767        out = 0;
12768    }
12769
12770    Py_DECREF(sep_obj);
12771    Py_DECREF(str_obj);
12772    if (kind2 != kind1)
12773        PyMem_Free(buf2);
12774
12775    return out;
12776  onError:
12777    Py_DECREF(sep_obj);
12778    Py_DECREF(str_obj);
12779    if (kind2 != kind1 && buf2)
12780        PyMem_Free(buf2);
12781    return NULL;
12782}
12783
12784PyDoc_STRVAR(partition__doc__,
12785             "S.partition(sep) -> (head, sep, tail)\n\
12786\n\
12787Search for the separator sep in S, and return the part before it,\n\
12788the separator itself, and the part after it.  If the separator is not\n\
12789found, return S and two empty strings.");
12790
12791static PyObject*
12792unicode_partition(PyObject *self, PyObject *separator)
12793{
12794    return PyUnicode_Partition(self, separator);
12795}
12796
12797PyDoc_STRVAR(rpartition__doc__,
12798             "S.rpartition(sep) -> (head, sep, tail)\n\
12799\n\
12800Search for the separator sep in S, starting at the end of S, and return\n\
12801the part before it, the separator itself, and the part after it.  If the\n\
12802separator is not found, return two empty strings and S.");
12803
12804static PyObject*
12805unicode_rpartition(PyObject *self, PyObject *separator)
12806{
12807    return PyUnicode_RPartition(self, separator);
12808}
12809
12810PyObject *
12811PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12812{
12813    PyObject *result;
12814
12815    s = PyUnicode_FromObject(s);
12816    if (s == NULL)
12817        return NULL;
12818    if (sep != NULL) {
12819        sep = PyUnicode_FromObject(sep);
12820        if (sep == NULL) {
12821            Py_DECREF(s);
12822            return NULL;
12823        }
12824    }
12825
12826    result = rsplit(s, sep, maxsplit);
12827
12828    Py_DECREF(s);
12829    Py_XDECREF(sep);
12830    return result;
12831}
12832
12833PyDoc_STRVAR(rsplit__doc__,
12834             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12835\n\
12836Return a list of the words in S, using sep as the\n\
12837delimiter string, starting at the end of the string and\n\
12838working to the front.  If maxsplit is given, at most maxsplit\n\
12839splits are done. If sep is not specified, any whitespace string\n\
12840is a separator.");
12841
12842static PyObject*
12843unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12844{
12845    static char *kwlist[] = {"sep", "maxsplit", 0};
12846    PyObject *substring = Py_None;
12847    Py_ssize_t maxcount = -1;
12848
12849    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12850                                     kwlist, &substring, &maxcount))
12851        return NULL;
12852
12853    if (substring == Py_None)
12854        return rsplit(self, NULL, maxcount);
12855    else if (PyUnicode_Check(substring))
12856        return rsplit(self, substring, maxcount);
12857    else
12858        return PyUnicode_RSplit(self, substring, maxcount);
12859}
12860
12861PyDoc_STRVAR(splitlines__doc__,
12862             "S.splitlines([keepends]) -> list of strings\n\
12863\n\
12864Return a list of the lines in S, breaking at line boundaries.\n\
12865Line breaks are not included in the resulting list unless keepends\n\
12866is given and true.");
12867
12868static PyObject*
12869unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12870{
12871    static char *kwlist[] = {"keepends", 0};
12872    int keepends = 0;
12873
12874    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12875                                     kwlist, &keepends))
12876        return NULL;
12877
12878    return PyUnicode_Splitlines(self, keepends);
12879}
12880
12881static
12882PyObject *unicode_str(PyObject *self)
12883{
12884    return unicode_result_unchanged(self);
12885}
12886
12887PyDoc_STRVAR(swapcase__doc__,
12888             "S.swapcase() -> str\n\
12889\n\
12890Return a copy of S with uppercase characters converted to lowercase\n\
12891and vice versa.");
12892
12893static PyObject*
12894unicode_swapcase(PyObject *self)
12895{
12896    if (PyUnicode_READY(self) == -1)
12897        return NULL;
12898    return case_operation(self, do_swapcase);
12899}
12900
12901/*[clinic input]
12902
12903@staticmethod
12904str.maketrans as unicode_maketrans
12905
12906  x: object
12907
12908  y: unicode=NULL
12909
12910  z: unicode=NULL
12911
12912  /
12913
12914Return a translation table usable for str.translate().
12915
12916If there is only one argument, it must be a dictionary mapping Unicode
12917ordinals (integers) or characters to Unicode ordinals, strings or None.
12918Character keys will be then converted to ordinals.
12919If there are two arguments, they must be strings of equal length, and
12920in the resulting dictionary, each character in x will be mapped to the
12921character at the same position in y. If there is a third argument, it
12922must be a string, whose characters will be mapped to None in the result.
12923[clinic start generated code]*/
12924
12925static PyObject *
12926unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12927/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
12928{
12929    PyObject *new = NULL, *key, *value;
12930    Py_ssize_t i = 0;
12931    int res;
12932
12933    new = PyDict_New();
12934    if (!new)
12935        return NULL;
12936    if (y != NULL) {
12937        int x_kind, y_kind, z_kind;
12938        void *x_data, *y_data, *z_data;
12939
12940        /* x must be a string too, of equal length */
12941        if (!PyUnicode_Check(x)) {
12942            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12943                            "be a string if there is a second argument");
12944            goto err;
12945        }
12946        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12947            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12948                            "arguments must have equal length");
12949            goto err;
12950        }
12951        /* create entries for translating chars in x to those in y */
12952        x_kind = PyUnicode_KIND(x);
12953        y_kind = PyUnicode_KIND(y);
12954        x_data = PyUnicode_DATA(x);
12955        y_data = PyUnicode_DATA(y);
12956        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12957            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12958            if (!key)
12959                goto err;
12960            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12961            if (!value) {
12962                Py_DECREF(key);
12963                goto err;
12964            }
12965            res = PyDict_SetItem(new, key, value);
12966            Py_DECREF(key);
12967            Py_DECREF(value);
12968            if (res < 0)
12969                goto err;
12970        }
12971        /* create entries for deleting chars in z */
12972        if (z != NULL) {
12973            z_kind = PyUnicode_KIND(z);
12974            z_data = PyUnicode_DATA(z);
12975            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12976                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12977                if (!key)
12978                    goto err;
12979                res = PyDict_SetItem(new, key, Py_None);
12980                Py_DECREF(key);
12981                if (res < 0)
12982                    goto err;
12983            }
12984        }
12985    } else {
12986        int kind;
12987        void *data;
12988
12989        /* x must be a dict */
12990        if (!PyDict_CheckExact(x)) {
12991            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12992                            "to maketrans it must be a dict");
12993            goto err;
12994        }
12995        /* copy entries into the new dict, converting string keys to int keys */
12996        while (PyDict_Next(x, &i, &key, &value)) {
12997            if (PyUnicode_Check(key)) {
12998                /* convert string keys to integer keys */
12999                PyObject *newkey;
13000                if (PyUnicode_GET_LENGTH(key) != 1) {
13001                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13002                                    "table must be of length 1");
13003                    goto err;
13004                }
13005                kind = PyUnicode_KIND(key);
13006                data = PyUnicode_DATA(key);
13007                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13008                if (!newkey)
13009                    goto err;
13010                res = PyDict_SetItem(new, newkey, value);
13011                Py_DECREF(newkey);
13012                if (res < 0)
13013                    goto err;
13014            } else if (PyLong_Check(key)) {
13015                /* just keep integer keys */
13016                if (PyDict_SetItem(new, key, value) < 0)
13017                    goto err;
13018            } else {
13019                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13020                                "be strings or integers");
13021                goto err;
13022            }
13023        }
13024    }
13025    return new;
13026  err:
13027    Py_DECREF(new);
13028    return NULL;
13029}
13030
13031PyDoc_STRVAR(translate__doc__,
13032             "S.translate(table) -> str\n\
13033\n\
13034Return a copy of the string S, where all characters have been mapped\n\
13035through the given translation table, which must be a mapping of\n\
13036Unicode ordinals to Unicode ordinals, strings, or None.\n\
13037Unmapped characters are left untouched. Characters mapped to None\n\
13038are deleted.");
13039
13040static PyObject*
13041unicode_translate(PyObject *self, PyObject *table)
13042{
13043    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13044}
13045
13046PyDoc_STRVAR(upper__doc__,
13047             "S.upper() -> str\n\
13048\n\
13049Return a copy of S converted to uppercase.");
13050
13051static PyObject*
13052unicode_upper(PyObject *self)
13053{
13054    if (PyUnicode_READY(self) == -1)
13055        return NULL;
13056    if (PyUnicode_IS_ASCII(self))
13057        return ascii_upper_or_lower(self, 0);
13058    return case_operation(self, do_upper);
13059}
13060
13061PyDoc_STRVAR(zfill__doc__,
13062             "S.zfill(width) -> str\n\
13063\n\
13064Pad a numeric string S with zeros on the left, to fill a field\n\
13065of the specified width. The string S is never truncated.");
13066
13067static PyObject *
13068unicode_zfill(PyObject *self, PyObject *args)
13069{
13070    Py_ssize_t fill;
13071    PyObject *u;
13072    Py_ssize_t width;
13073    int kind;
13074    void *data;
13075    Py_UCS4 chr;
13076
13077    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13078        return NULL;
13079
13080    if (PyUnicode_READY(self) == -1)
13081        return NULL;
13082
13083    if (PyUnicode_GET_LENGTH(self) >= width)
13084        return unicode_result_unchanged(self);
13085
13086    fill = width - PyUnicode_GET_LENGTH(self);
13087
13088    u = pad(self, fill, 0, '0');
13089
13090    if (u == NULL)
13091        return NULL;
13092
13093    kind = PyUnicode_KIND(u);
13094    data = PyUnicode_DATA(u);
13095    chr = PyUnicode_READ(kind, data, fill);
13096
13097    if (chr == '+' || chr == '-') {
13098        /* move sign to beginning of string */
13099        PyUnicode_WRITE(kind, data, 0, chr);
13100        PyUnicode_WRITE(kind, data, fill, '0');
13101    }
13102
13103    assert(_PyUnicode_CheckConsistency(u, 1));
13104    return u;
13105}
13106
#if 0
/* Compiled out.  Wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13114
13115PyDoc_STRVAR(startswith__doc__,
13116             "S.startswith(prefix[, start[, end]]) -> bool\n\
13117\n\
13118Return True if S starts with the specified prefix, False otherwise.\n\
13119With optional start, test S beginning at that position.\n\
13120With optional end, stop comparing S at that position.\n\
13121prefix can also be a tuple of strings to try.");
13122
13123static PyObject *
13124unicode_startswith(PyObject *self,
13125                   PyObject *args)
13126{
13127    PyObject *subobj;
13128    PyObject *substring;
13129    Py_ssize_t start = 0;
13130    Py_ssize_t end = PY_SSIZE_T_MAX;
13131    int result;
13132
13133    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13134        return NULL;
13135    if (PyTuple_Check(subobj)) {
13136        Py_ssize_t i;
13137        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13138            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
13139            if (substring == NULL)
13140                return NULL;
13141            result = tailmatch(self, substring, start, end, -1);
13142            Py_DECREF(substring);
13143            if (result == -1)
13144                return NULL;
13145            if (result) {
13146                Py_RETURN_TRUE;
13147            }
13148        }
13149        /* nothing matched */
13150        Py_RETURN_FALSE;
13151    }
13152    substring = PyUnicode_FromObject(subobj);
13153    if (substring == NULL) {
13154        if (PyErr_ExceptionMatches(PyExc_TypeError))
13155            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13156                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13157        return NULL;
13158    }
13159    result = tailmatch(self, substring, start, end, -1);
13160    Py_DECREF(substring);
13161    if (result == -1)
13162        return NULL;
13163    return PyBool_FromLong(result);
13164}
13165
13166
13167PyDoc_STRVAR(endswith__doc__,
13168             "S.endswith(suffix[, start[, end]]) -> bool\n\
13169\n\
13170Return True if S ends with the specified suffix, False otherwise.\n\
13171With optional start, test S beginning at that position.\n\
13172With optional end, stop comparing S at that position.\n\
13173suffix can also be a tuple of strings to try.");
13174
13175static PyObject *
13176unicode_endswith(PyObject *self,
13177                 PyObject *args)
13178{
13179    PyObject *subobj;
13180    PyObject *substring;
13181    Py_ssize_t start = 0;
13182    Py_ssize_t end = PY_SSIZE_T_MAX;
13183    int result;
13184
13185    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13186        return NULL;
13187    if (PyTuple_Check(subobj)) {
13188        Py_ssize_t i;
13189        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13190            substring = PyUnicode_FromObject(
13191                PyTuple_GET_ITEM(subobj, i));
13192            if (substring == NULL)
13193                return NULL;
13194            result = tailmatch(self, substring, start, end, +1);
13195            Py_DECREF(substring);
13196            if (result == -1)
13197                return NULL;
13198            if (result) {
13199                Py_RETURN_TRUE;
13200            }
13201        }
13202        Py_RETURN_FALSE;
13203    }
13204    substring = PyUnicode_FromObject(subobj);
13205    if (substring == NULL) {
13206        if (PyErr_ExceptionMatches(PyExc_TypeError))
13207            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13208                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13209        return NULL;
13210    }
13211    result = tailmatch(self, substring, start, end, +1);
13212    Py_DECREF(substring);
13213    if (result == -1)
13214        return NULL;
13215    return PyBool_FromLong(result);
13216}
13217
13218Py_LOCAL_INLINE(void)
13219_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13220{
13221    if (!writer->readonly)
13222        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13223    else {
13224        /* Copy-on-write mode: set buffer size to 0 so
13225         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13226         * next write. */
13227        writer->size = 0;
13228    }
13229    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13230    writer->data = PyUnicode_DATA(writer->buffer);
13231    writer->kind = PyUnicode_KIND(writer->buffer);
13232}
13233
13234void
13235_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13236{
13237    memset(writer, 0, sizeof(*writer));
13238#ifdef Py_DEBUG
13239    writer->kind = 5;    /* invalid kind */
13240#endif
13241    writer->min_char = 127;
13242}
13243
13244int
13245_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13246                                 Py_ssize_t length, Py_UCS4 maxchar)
13247{
13248#ifdef MS_WINDOWS
13249   /* On Windows, overallocate by 50% is the best factor */
13250#  define OVERALLOCATE_FACTOR 2
13251#else
13252   /* On Linux, overallocate by 25% is the best factor */
13253#  define OVERALLOCATE_FACTOR 4
13254#endif
13255    Py_ssize_t newlen;
13256    PyObject *newbuffer;
13257
13258    assert(length > 0);
13259
13260    if (length > PY_SSIZE_T_MAX - writer->pos) {
13261        PyErr_NoMemory();
13262        return -1;
13263    }
13264    newlen = writer->pos + length;
13265
13266    maxchar = Py_MAX(maxchar, writer->min_char);
13267
13268    if (writer->buffer == NULL) {
13269        assert(!writer->readonly);
13270        if (writer->overallocate
13271            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13272            /* overallocate to limit the number of realloc() */
13273            newlen += newlen / OVERALLOCATE_FACTOR;
13274        }
13275        if (newlen < writer->min_length)
13276            newlen = writer->min_length;
13277
13278        writer->buffer = PyUnicode_New(newlen, maxchar);
13279        if (writer->buffer == NULL)
13280            return -1;
13281    }
13282    else if (newlen > writer->size) {
13283        if (writer->overallocate
13284            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13285            /* overallocate to limit the number of realloc() */
13286            newlen += newlen / OVERALLOCATE_FACTOR;
13287        }
13288        if (newlen < writer->min_length)
13289            newlen = writer->min_length;
13290
13291        if (maxchar > writer->maxchar || writer->readonly) {
13292            /* resize + widen */
13293            newbuffer = PyUnicode_New(newlen, maxchar);
13294            if (newbuffer == NULL)
13295                return -1;
13296            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13297                                          writer->buffer, 0, writer->pos);
13298            Py_DECREF(writer->buffer);
13299            writer->readonly = 0;
13300        }
13301        else {
13302            newbuffer = resize_compact(writer->buffer, newlen);
13303            if (newbuffer == NULL)
13304                return -1;
13305        }
13306        writer->buffer = newbuffer;
13307    }
13308    else if (maxchar > writer->maxchar) {
13309        assert(!writer->readonly);
13310        newbuffer = PyUnicode_New(writer->size, maxchar);
13311        if (newbuffer == NULL)
13312            return -1;
13313        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13314                                      writer->buffer, 0, writer->pos);
13315        Py_DECREF(writer->buffer);
13316        writer->buffer = newbuffer;
13317    }
13318    _PyUnicodeWriter_Update(writer);
13319    return 0;
13320
13321#undef OVERALLOCATE_FACTOR
13322}
13323
13324Py_LOCAL_INLINE(int)
13325_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13326{
13327    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13328        return -1;
13329    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13330    writer->pos++;
13331    return 0;
13332}
13333
13334int
13335_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13336{
13337    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13338}
13339
13340int
13341_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13342{
13343    Py_UCS4 maxchar;
13344    Py_ssize_t len;
13345
13346    if (PyUnicode_READY(str) == -1)
13347        return -1;
13348    len = PyUnicode_GET_LENGTH(str);
13349    if (len == 0)
13350        return 0;
13351    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13352    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13353        if (writer->buffer == NULL && !writer->overallocate) {
13354            assert(_PyUnicode_CheckConsistency(str, 1));
13355            writer->readonly = 1;
13356            Py_INCREF(str);
13357            writer->buffer = str;
13358            _PyUnicodeWriter_Update(writer);
13359            writer->pos += len;
13360            return 0;
13361        }
13362        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13363            return -1;
13364    }
13365    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13366                                  str, 0, len);
13367    writer->pos += len;
13368    return 0;
13369}
13370
13371int
13372_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13373                                Py_ssize_t start, Py_ssize_t end)
13374{
13375    Py_UCS4 maxchar;
13376    Py_ssize_t len;
13377
13378    if (PyUnicode_READY(str) == -1)
13379        return -1;
13380
13381    assert(0 <= start);
13382    assert(end <= PyUnicode_GET_LENGTH(str));
13383    assert(start <= end);
13384
13385    if (end == 0)
13386        return 0;
13387
13388    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13389        return _PyUnicodeWriter_WriteStr(writer, str);
13390
13391    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13392        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13393    else
13394        maxchar = writer->maxchar;
13395    len = end - start;
13396
13397    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13398        return -1;
13399
13400    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13401                                  str, start, len);
13402    writer->pos += len;
13403    return 0;
13404}
13405
13406int
13407_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13408                                  const char *ascii, Py_ssize_t len)
13409{
13410    if (len == -1)
13411        len = strlen(ascii);
13412
13413    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13414
13415    if (writer->buffer == NULL && !writer->overallocate) {
13416        PyObject *str;
13417
13418        str = _PyUnicode_FromASCII(ascii, len);
13419        if (str == NULL)
13420            return -1;
13421
13422        writer->readonly = 1;
13423        writer->buffer = str;
13424        _PyUnicodeWriter_Update(writer);
13425        writer->pos += len;
13426        return 0;
13427    }
13428
13429    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13430        return -1;
13431
13432    switch (writer->kind)
13433    {
13434    case PyUnicode_1BYTE_KIND:
13435    {
13436        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13437        Py_UCS1 *data = writer->data;
13438
13439        Py_MEMCPY(data + writer->pos, str, len);
13440        break;
13441    }
13442    case PyUnicode_2BYTE_KIND:
13443    {
13444        _PyUnicode_CONVERT_BYTES(
13445            Py_UCS1, Py_UCS2,
13446            ascii, ascii + len,
13447            (Py_UCS2 *)writer->data + writer->pos);
13448        break;
13449    }
13450    case PyUnicode_4BYTE_KIND:
13451    {
13452        _PyUnicode_CONVERT_BYTES(
13453            Py_UCS1, Py_UCS4,
13454            ascii, ascii + len,
13455            (Py_UCS4 *)writer->data + writer->pos);
13456        break;
13457    }
13458    default:
13459        assert(0);
13460    }
13461
13462    writer->pos += len;
13463    return 0;
13464}
13465
13466int
13467_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13468                                   const char *str, Py_ssize_t len)
13469{
13470    Py_UCS4 maxchar;
13471
13472    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13473    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13474        return -1;
13475    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13476    writer->pos += len;
13477    return 0;
13478}
13479
13480PyObject *
13481_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13482{
13483    PyObject *str;
13484    if (writer->pos == 0) {
13485        Py_CLEAR(writer->buffer);
13486        _Py_RETURN_UNICODE_EMPTY();
13487    }
13488    if (writer->readonly) {
13489        str = writer->buffer;
13490        writer->buffer = NULL;
13491        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13492        return str;
13493    }
13494    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13495        PyObject *newbuffer;
13496        newbuffer = resize_compact(writer->buffer, writer->pos);
13497        if (newbuffer == NULL) {
13498            Py_CLEAR(writer->buffer);
13499            return NULL;
13500        }
13501        writer->buffer = newbuffer;
13502    }
13503    str = writer->buffer;
13504    writer->buffer = NULL;
13505    assert(_PyUnicode_CheckConsistency(str, 1));
13506    return unicode_result_ready(str);
13507}
13508
13509void
13510_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13511{
13512    Py_CLEAR(writer->buffer);
13513}
13514
13515#include "stringlib/unicode_format.h"
13516
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13528
13529static PyObject *
13530unicode__format__(PyObject* self, PyObject* args)
13531{
13532    PyObject *format_spec;
13533    _PyUnicodeWriter writer;
13534    int ret;
13535
13536    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13537        return NULL;
13538
13539    if (PyUnicode_READY(self) == -1)
13540        return NULL;
13541    _PyUnicodeWriter_Init(&writer);
13542    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13543                                          self, format_spec, 0,
13544                                          PyUnicode_GET_LENGTH(format_spec));
13545    if (ret == -1) {
13546        _PyUnicodeWriter_Dealloc(&writer);
13547        return NULL;
13548    }
13549    return _PyUnicodeWriter_Finish(&writer);
13550}
13551
/* Docstring attached to the "__format__" entry of unicode_methods. */
PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> str\n\
\n\
Return a formatted version of S as described by format_spec.");
13556
13557static PyObject *
13558unicode__sizeof__(PyObject *v)
13559{
13560    Py_ssize_t size;
13561
13562    /* If it's a compact object, account for base structure +
13563       character data. */
13564    if (PyUnicode_IS_COMPACT_ASCII(v))
13565        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13566    else if (PyUnicode_IS_COMPACT(v))
13567        size = sizeof(PyCompactUnicodeObject) +
13568            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13569    else {
13570        /* If it is a two-block object, account for base object, and
13571           for character block if present. */
13572        size = sizeof(PyUnicodeObject);
13573        if (_PyUnicode_DATA_ANY(v))
13574            size += (PyUnicode_GET_LENGTH(v) + 1) *
13575                PyUnicode_KIND(v);
13576    }
13577    /* If the wstr pointer is present, account for it unless it is shared
13578       with the data pointer. Check if the data is not shared. */
13579    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13580        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13581    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13582        size += PyUnicode_UTF8_LENGTH(v) + 1;
13583
13584    return PyLong_FromSsize_t(size);
13585}
13586
/* Docstring attached to the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
13589
13590static PyObject *
13591unicode_getnewargs(PyObject *v)
13592{
13593    PyObject *copy = _PyUnicode_Copy(v);
13594    if (!copy)
13595        return NULL;
13596    return Py_BuildValue("(N)", copy);
13597}
13598
/* Method table of the str type.  Each entry gives the Python-level name,
   the C implementation, the calling convention flags and the docstring.
   The table is terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* NOTE(review): macro defined outside this view; presumably expands to
       the "maketrans" entry including its trailing comma -- confirm. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* __getnewargs__ is registered without a docstring. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13656
13657static PyObject *
13658unicode_mod(PyObject *v, PyObject *w)
13659{
13660    if (!PyUnicode_Check(v))
13661        Py_RETURN_NOTIMPLEMENTED;
13662    return PyUnicode_Format(v, w);
13663}
13664
/* Number protocol slots of the str type.  Only nb_remainder is filled in
   (unicode_mod, the `%` operator); the slots after it are not listed and
   are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13671
/* Sequence protocol slots of the str type: length, concatenation,
   repetition, item access and containment.  The slice and item
   assignment slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13682
13683static PyObject*
13684unicode_subscript(PyObject* self, PyObject* item)
13685{
13686    if (PyUnicode_READY(self) == -1)
13687        return NULL;
13688
13689    if (PyIndex_Check(item)) {
13690        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13691        if (i == -1 && PyErr_Occurred())
13692            return NULL;
13693        if (i < 0)
13694            i += PyUnicode_GET_LENGTH(self);
13695        return unicode_getitem(self, i);
13696    } else if (PySlice_Check(item)) {
13697        Py_ssize_t start, stop, step, slicelength, cur, i;
13698        PyObject *result;
13699        void *src_data, *dest_data;
13700        int src_kind, dest_kind;
13701        Py_UCS4 ch, max_char, kind_limit;
13702
13703        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13704                                 &start, &stop, &step, &slicelength) < 0) {
13705            return NULL;
13706        }
13707
13708        if (slicelength <= 0) {
13709            _Py_RETURN_UNICODE_EMPTY();
13710        } else if (start == 0 && step == 1 &&
13711                   slicelength == PyUnicode_GET_LENGTH(self)) {
13712            return unicode_result_unchanged(self);
13713        } else if (step == 1) {
13714            return PyUnicode_Substring(self,
13715                                       start, start + slicelength);
13716        }
13717        /* General case */
13718        src_kind = PyUnicode_KIND(self);
13719        src_data = PyUnicode_DATA(self);
13720        if (!PyUnicode_IS_ASCII(self)) {
13721            kind_limit = kind_maxchar_limit(src_kind);
13722            max_char = 0;
13723            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13724                ch = PyUnicode_READ(src_kind, src_data, cur);
13725                if (ch > max_char) {
13726                    max_char = ch;
13727                    if (max_char >= kind_limit)
13728                        break;
13729                }
13730            }
13731        }
13732        else
13733            max_char = 127;
13734        result = PyUnicode_New(slicelength, max_char);
13735        if (result == NULL)
13736            return NULL;
13737        dest_kind = PyUnicode_KIND(result);
13738        dest_data = PyUnicode_DATA(result);
13739
13740        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13741            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13742            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13743        }
13744        assert(_PyUnicode_CheckConsistency(result, 1));
13745        return result;
13746    } else {
13747        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13748        return NULL;
13749    }
13750}
13751
/* Mapping protocol slots of the str type: length and subscripting
   (indexes and slices, see unicode_subscript).  Item assignment is not
   provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13757
13758
13759/* Helpers for PyUnicode_Format() */
13760
/* State shared by the helpers of PyUnicode_Format() while one format
   string is being processed. */
struct unicode_formatter_t {
    PyObject *args;         /* argument tuple, or a single value when
                               arglen < 0 */
    int args_owned;         /* non-zero if this struct holds a reference to
                               args that must be released */
    Py_ssize_t arglen, argidx;  /* number of arguments (negative: args is
                                   not a tuple) and index of the next
                                   argument to consume */
    PyObject *dict;         /* mapping used by "%(name)" lookups, or NULL */

    enum PyUnicode_Kind fmtkind;    /* kind of the format string data */
    Py_ssize_t fmtcnt, fmtpos;      /* characters left to read and current
                                       read position in the format string */
    void *fmtdata;          /* character data of the format string */
    PyObject *fmtstr;       /* the format string object */

    _PyUnicodeWriter writer;    /* receives the formatted output */
};
13774
/* Description of a single "%..." conversion being parsed and formatted. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* character currently being examined; ends up as
                           the conversion type (e.g. 'd', 'x', 'f') */
    int flags;          /* bitmask of F_* flags (F_ALT, F_SIGN, ...) */
    Py_ssize_t width;   /* field width; the integer fast path treats -1 as
                           "not specified" */
    int prec;           /* precision; a negative value means
                           "not specified" (floats then default to 6) */
    int sign;           /* NOTE(review): not used in the visible helpers;
                           presumably set when the conversion is numeric
                           and may carry a sign -- confirm */
};
13782
13783static PyObject *
13784unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13785{
13786    Py_ssize_t argidx = ctx->argidx;
13787
13788    if (argidx < ctx->arglen) {
13789        ctx->argidx++;
13790        if (ctx->arglen < 0)
13791            return ctx->args;
13792        else
13793            return PyTuple_GetItem(ctx->args, argidx);
13794    }
13795    PyErr_SetString(PyExc_TypeError,
13796                    "not enough arguments for format string");
13797    return NULL;
13798}
13799
13800/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13801
13802/* Format a float into the writer if the writer is not NULL, or into *p_output
13803   otherwise.
13804
13805   Return 0 on success, raise an exception and return -1 on error. */
13806static int
13807formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13808            PyObject **p_output,
13809            _PyUnicodeWriter *writer)
13810{
13811    char *p;
13812    double x;
13813    Py_ssize_t len;
13814    int prec;
13815    int dtoa_flags;
13816
13817    x = PyFloat_AsDouble(v);
13818    if (x == -1.0 && PyErr_Occurred())
13819        return -1;
13820
13821    prec = arg->prec;
13822    if (prec < 0)
13823        prec = 6;
13824
13825    if (arg->flags & F_ALT)
13826        dtoa_flags = Py_DTSF_ALT;
13827    else
13828        dtoa_flags = 0;
13829    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13830    if (p == NULL)
13831        return -1;
13832    len = strlen(p);
13833    if (writer) {
13834        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13835            PyMem_Free(p);
13836            return -1;
13837        }
13838    }
13839    else
13840        *p_output = _PyUnicode_FromASCII(p, len);
13841    PyMem_Free(p);
13842    return 0;
13843}
13844
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value:  a new reference to a str object, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker is present only for o, x and X conversions when
 *         alt is non-zero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * Errors: OverflowError if prec is larger than INT_MAX-3, ValueError if
 * the converted number is longer than INT_MAX characters.
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep numnondigits + prec (at most 3 + prec) within an int */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Convert with the base marker; it is stripped below if unwanted.
       An unknown type only trips the assertion and is then handled as
       decimal. */
    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set.  buf is advanced past the
       two marker characters and the sign, if any, is rewritten at the new
       start, so buf no longer points at the start of result's data. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits.  The
       padded text is built in a temporary bytes object that replaces
       result. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker, then the zero padding, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes object, or no longer
       starts at the beginning of the str data, build a fresh str from it;
       otherwise only shrink result when its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
13990
13991/* Format an integer or a float as an integer.
13992 * Return 1 if the number has been formatted into the writer,
13993 *        0 if the number has been formatted into *p_output
13994 *       -1 and raise an exception on error */
13995static int
13996mainformatlong(PyObject *v,
13997               struct unicode_format_arg_t *arg,
13998               PyObject **p_output,
13999               _PyUnicodeWriter *writer)
14000{
14001    PyObject *iobj, *res;
14002    char type = (char)arg->ch;
14003
14004    if (!PyNumber_Check(v))
14005        goto wrongtype;
14006
14007    /* make sure number is a type of integer for o, x, and X */
14008    if (!PyLong_Check(v)) {
14009        if (type == 'o' || type == 'x' || type == 'X') {
14010            iobj = PyNumber_Index(v);
14011            if (iobj == NULL) {
14012                if (PyErr_ExceptionMatches(PyExc_TypeError))
14013                    goto wrongtype;
14014                return -1;
14015            }
14016        }
14017        else {
14018            iobj = PyNumber_Long(v);
14019            if (iobj == NULL ) {
14020                if (PyErr_ExceptionMatches(PyExc_TypeError))
14021                    goto wrongtype;
14022                return -1;
14023            }
14024        }
14025        assert(PyLong_Check(iobj));
14026    }
14027    else {
14028        iobj = v;
14029        Py_INCREF(iobj);
14030    }
14031
14032    if (PyLong_CheckExact(v)
14033        && arg->width == -1 && arg->prec == -1
14034        && !(arg->flags & (F_SIGN | F_BLANK))
14035        && type != 'X')
14036    {
14037        /* Fast path */
14038        int alternate = arg->flags & F_ALT;
14039        int base;
14040
14041        switch(type)
14042        {
14043            default:
14044                assert(0 && "'type' not in [diuoxX]");
14045            case 'd':
14046            case 'i':
14047            case 'u':
14048                base = 10;
14049                break;
14050            case 'o':
14051                base = 8;
14052                break;
14053            case 'x':
14054            case 'X':
14055                base = 16;
14056                break;
14057        }
14058
14059        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14060            Py_DECREF(iobj);
14061            return -1;
14062        }
14063        Py_DECREF(iobj);
14064        return 1;
14065    }
14066
14067    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14068    Py_DECREF(iobj);
14069    if (res == NULL)
14070        return -1;
14071    *p_output = res;
14072    return 0;
14073
14074wrongtype:
14075    switch(type)
14076    {
14077        case 'o':
14078        case 'x':
14079        case 'X':
14080            PyErr_Format(PyExc_TypeError,
14081                    "%%%c format: an integer is required, "
14082                    "not %.200s",
14083                    type, Py_TYPE(v)->tp_name);
14084            break;
14085        default:
14086            PyErr_Format(PyExc_TypeError,
14087                    "%%%c format: a number is required, "
14088                    "not %.200s",
14089                    type, Py_TYPE(v)->tp_name);
14090            break;
14091    }
14092    return -1;
14093}
14094
14095static Py_UCS4
14096formatchar(PyObject *v)
14097{
14098    /* presume that the buffer is at least 3 characters long */
14099    if (PyUnicode_Check(v)) {
14100        if (PyUnicode_GET_LENGTH(v) == 1) {
14101            return PyUnicode_READ_CHAR(v, 0);
14102        }
14103        goto onError;
14104    }
14105    else {
14106        PyObject *iobj;
14107        long x;
14108        /* make sure number is a type of integer */
14109        if (!PyLong_Check(v)) {
14110            iobj = PyNumber_Index(v);
14111            if (iobj == NULL) {
14112                goto onError;
14113            }
14114            v = iobj;
14115            Py_DECREF(iobj);
14116        }
14117        /* Integer input truncated to a character */
14118        x = PyLong_AsLong(v);
14119        if (x == -1 && PyErr_Occurred())
14120            goto onError;
14121
14122        if (x < 0 || x > MAX_UNICODE) {
14123            PyErr_SetString(PyExc_OverflowError,
14124                            "%c arg not in range(0x110000)");
14125            return (Py_UCS4) -1;
14126        }
14127
14128        return (Py_UCS4) x;
14129    }
14130
14131  onError:
14132    PyErr_SetString(PyExc_TypeError,
14133                    "%c requires int or char");
14134    return (Py_UCS4) -1;
14135}
14136
14137/* Parse options of an argument: flags, width, precision.
14138   Handle also "%(name)" syntax.
14139
14140   Return 0 if the argument has been formatted into arg->str.
14141   Return 1 if the argument has been written into ctx->writer,
14142   Raise an exception and return -1 on error. */
14143static int
14144unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14145                         struct unicode_format_arg_t *arg)
14146{
14147#define FORMAT_READ(ctx) \
14148        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14149
14150    PyObject *v;
14151
14152    if (arg->ch == '(') {
14153        /* Get argument value from a dictionary. Example: "%(name)s". */
14154        Py_ssize_t keystart;
14155        Py_ssize_t keylen;
14156        PyObject *key;
14157        int pcount = 1;
14158
14159        if (ctx->dict == NULL) {
14160            PyErr_SetString(PyExc_TypeError,
14161                            "format requires a mapping");
14162            return -1;
14163        }
14164        ++ctx->fmtpos;
14165        --ctx->fmtcnt;
14166        keystart = ctx->fmtpos;
14167        /* Skip over balanced parentheses */
14168        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14169            arg->ch = FORMAT_READ(ctx);
14170            if (arg->ch == ')')
14171                --pcount;
14172            else if (arg->ch == '(')
14173                ++pcount;
14174            ctx->fmtpos++;
14175        }
14176        keylen = ctx->fmtpos - keystart - 1;
14177        if (ctx->fmtcnt < 0 || pcount > 0) {
14178            PyErr_SetString(PyExc_ValueError,
14179                            "incomplete format key");
14180            return -1;
14181        }
14182        key = PyUnicode_Substring(ctx->fmtstr,
14183                                  keystart, keystart + keylen);
14184        if (key == NULL)
14185            return -1;
14186        if (ctx->args_owned) {
14187            Py_DECREF(ctx->args);
14188            ctx->args_owned = 0;
14189        }
14190        ctx->args = PyObject_GetItem(ctx->dict, key);
14191        Py_DECREF(key);
14192        if (ctx->args == NULL)
14193            return -1;
14194        ctx->args_owned = 1;
14195        ctx->arglen = -1;
14196        ctx->argidx = -2;
14197    }
14198
14199    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14200    while (--ctx->fmtcnt >= 0) {
14201        arg->ch = FORMAT_READ(ctx);
14202        ctx->fmtpos++;
14203        switch (arg->ch) {
14204        case '-': arg->flags |= F_LJUST; continue;
14205        case '+': arg->flags |= F_SIGN; continue;
14206        case ' ': arg->flags |= F_BLANK; continue;
14207        case '#': arg->flags |= F_ALT; continue;
14208        case '0': arg->flags |= F_ZERO; continue;
14209        }
14210        break;
14211    }
14212
14213    /* Parse width. Example: "%10s" => width=10 */
14214    if (arg->ch == '*') {
14215        v = unicode_format_getnextarg(ctx);
14216        if (v == NULL)
14217            return -1;
14218        if (!PyLong_Check(v)) {
14219            PyErr_SetString(PyExc_TypeError,
14220                            "* wants int");
14221            return -1;
14222        }
14223        arg->width = PyLong_AsSsize_t(v);
14224        if (arg->width == -1 && PyErr_Occurred())
14225            return -1;
14226        if (arg->width < 0) {
14227            arg->flags |= F_LJUST;
14228            arg->width = -arg->width;
14229        }
14230        if (--ctx->fmtcnt >= 0) {
14231            arg->ch = FORMAT_READ(ctx);
14232            ctx->fmtpos++;
14233        }
14234    }
14235    else if (arg->ch >= '0' && arg->ch <= '9') {
14236        arg->width = arg->ch - '0';
14237        while (--ctx->fmtcnt >= 0) {
14238            arg->ch = FORMAT_READ(ctx);
14239            ctx->fmtpos++;
14240            if (arg->ch < '0' || arg->ch > '9')
14241                break;
14242            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14243               mixing signed and unsigned comparison. Since arg->ch is between
14244               '0' and '9', casting to int is safe. */
14245            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14246                PyErr_SetString(PyExc_ValueError,
14247                                "width too big");
14248                return -1;
14249            }
14250            arg->width = arg->width*10 + (arg->ch - '0');
14251        }
14252    }
14253
14254    /* Parse precision. Example: "%.3f" => prec=3 */
14255    if (arg->ch == '.') {
14256        arg->prec = 0;
14257        if (--ctx->fmtcnt >= 0) {
14258            arg->ch = FORMAT_READ(ctx);
14259            ctx->fmtpos++;
14260        }
14261        if (arg->ch == '*') {
14262            v = unicode_format_getnextarg(ctx);
14263            if (v == NULL)
14264                return -1;
14265            if (!PyLong_Check(v)) {
14266                PyErr_SetString(PyExc_TypeError,
14267                                "* wants int");
14268                return -1;
14269            }
14270            arg->prec = _PyLong_AsInt(v);
14271            if (arg->prec == -1 && PyErr_Occurred())
14272                return -1;
14273            if (arg->prec < 0)
14274                arg->prec = 0;
14275            if (--ctx->fmtcnt >= 0) {
14276                arg->ch = FORMAT_READ(ctx);
14277                ctx->fmtpos++;
14278            }
14279        }
14280        else if (arg->ch >= '0' && arg->ch <= '9') {
14281            arg->prec = arg->ch - '0';
14282            while (--ctx->fmtcnt >= 0) {
14283                arg->ch = FORMAT_READ(ctx);
14284                ctx->fmtpos++;
14285                if (arg->ch < '0' || arg->ch > '9')
14286                    break;
14287                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14288                    PyErr_SetString(PyExc_ValueError,
14289                                    "precision too big");
14290                    return -1;
14291                }
14292                arg->prec = arg->prec*10 + (arg->ch - '0');
14293            }
14294        }
14295    }
14296
14297    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14298    if (ctx->fmtcnt >= 0) {
14299        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14300            if (--ctx->fmtcnt >= 0) {
14301                arg->ch = FORMAT_READ(ctx);
14302                ctx->fmtpos++;
14303            }
14304        }
14305    }
14306    if (ctx->fmtcnt < 0) {
14307        PyErr_SetString(PyExc_ValueError,
14308                        "incomplete format");
14309        return -1;
14310    }
14311    return 0;
14312
14313#undef FORMAT_READ
14314}
14315
14316/* Format one argument. Supported conversion specifiers:
14317
14318   - "s", "r", "a": any type
14319   - "i", "d", "u": int or float
14320   - "o", "x", "X": int
14321   - "e", "E", "f", "F", "g", "G": float
14322   - "c": int or str (1 character)
14323
14324   When possible, the output is written directly into the Unicode writer
14325   (ctx->writer). A string is created when padding is required.
14326
14327   Return 0 if the argument has been formatted into *p_str,
14328          1 if the argument has been written into ctx->writer,
14329         -1 on error. */
14330static int
14331unicode_format_arg_format(struct unicode_formatter_t *ctx,
14332                          struct unicode_format_arg_t *arg,
14333                          PyObject **p_str)
14334{
14335    PyObject *v;
14336    _PyUnicodeWriter *writer = &ctx->writer;
14337
14338    if (ctx->fmtcnt == 0)
14339        ctx->writer.overallocate = 0;
14340
14341    if (arg->ch == '%') {
14342        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14343            return -1;
14344        return 1;
14345    }
14346
14347    v = unicode_format_getnextarg(ctx);
14348    if (v == NULL)
14349        return -1;
14350
14351
14352    switch (arg->ch) {
14353    case 's':
14354    case 'r':
14355    case 'a':
14356        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14357            /* Fast path */
14358            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14359                return -1;
14360            return 1;
14361        }
14362
14363        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14364            *p_str = v;
14365            Py_INCREF(*p_str);
14366        }
14367        else {
14368            if (arg->ch == 's')
14369                *p_str = PyObject_Str(v);
14370            else if (arg->ch == 'r')
14371                *p_str = PyObject_Repr(v);
14372            else
14373                *p_str = PyObject_ASCII(v);
14374        }
14375        break;
14376
14377    case 'i':
14378    case 'd':
14379    case 'u':
14380    case 'o':
14381    case 'x':
14382    case 'X':
14383    {
14384        int ret = mainformatlong(v, arg, p_str, writer);
14385        if (ret != 0)
14386            return ret;
14387        arg->sign = 1;
14388        break;
14389    }
14390
14391    case 'e':
14392    case 'E':
14393    case 'f':
14394    case 'F':
14395    case 'g':
14396    case 'G':
14397        if (arg->width == -1 && arg->prec == -1
14398            && !(arg->flags & (F_SIGN | F_BLANK)))
14399        {
14400            /* Fast path */
14401            if (formatfloat(v, arg, NULL, writer) == -1)
14402                return -1;
14403            return 1;
14404        }
14405
14406        arg->sign = 1;
14407        if (formatfloat(v, arg, p_str, NULL) == -1)
14408            return -1;
14409        break;
14410
14411    case 'c':
14412    {
14413        Py_UCS4 ch = formatchar(v);
14414        if (ch == (Py_UCS4) -1)
14415            return -1;
14416        if (arg->width == -1 && arg->prec == -1) {
14417            /* Fast path */
14418            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14419                return -1;
14420            return 1;
14421        }
14422        *p_str = PyUnicode_FromOrdinal(ch);
14423        break;
14424    }
14425
14426    default:
14427        PyErr_Format(PyExc_ValueError,
14428                     "unsupported format character '%c' (0x%x) "
14429                     "at index %zd",
14430                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14431                     (int)arg->ch,
14432                     ctx->fmtpos - 1);
14433        return -1;
14434    }
14435    if (*p_str == NULL)
14436        return -1;
14437    assert (PyUnicode_Check(*p_str));
14438    return 0;
14439}
14440
/* Write the already formatted string str into ctx->writer, applying the
   width, precision (for "s", "r" and "a" only), sign, fill character and
   alternate-form prefix described by arg.

   arg->width and arg->sign are used as scratch space and are modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only honoured when arg->sign is set, which
       unicode_format_arg_format() does for the numeric conversions. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no forced sign: copy str as is. */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* str already starts with a sign: remember it and skip it, it
               is written separately below. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to emit at all. */
            arg->sign = 0;
    }
    /* From here on the width is at least the number of characters copied
       from str (sign excluded). */
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Account for the fill character only if left padding will occur. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* The width has no room left for the sign: reserve one more cell. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written after the padding, further down. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses up one column of the requested width. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        /* Non-space fill: the prefix precedes the padding, write it now. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* Exclude the two prefix characters from both the width and the
           number of characters left to copy from str. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding done: prevents the right padding below from running. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* Note: right padding always uses spaces, whatever fill is. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14595
14596/* Helper of PyUnicode_Format(): format one arg.
14597   Return 0 on success, raise an exception and return -1 on error. */
14598static int
14599unicode_format_arg(struct unicode_formatter_t *ctx)
14600{
14601    struct unicode_format_arg_t arg;
14602    PyObject *str;
14603    int ret;
14604
14605    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14606    arg.flags = 0;
14607    arg.width = -1;
14608    arg.prec = -1;
14609    arg.sign = 0;
14610    str = NULL;
14611
14612    ret = unicode_format_arg_parse(ctx, &arg);
14613    if (ret == -1)
14614        return -1;
14615
14616    ret = unicode_format_arg_format(ctx, &arg, &str);
14617    if (ret == -1)
14618        return -1;
14619
14620    if (ret != 1) {
14621        ret = unicode_format_arg_output(ctx, &arg, str);
14622        Py_DECREF(str);
14623        if (ret == -1)
14624            return -1;
14625    }
14626
14627    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14628        PyErr_SetString(PyExc_TypeError,
14629                        "not all arguments converted during string formatting");
14630        return -1;
14631    }
14632    return 0;
14633}
14634
/* Implement printf-style formatting: format % args.

   format is converted with PyUnicode_FromObject(); args is either a tuple
   of values, a mapping (for "%(name)s" specifiers) or a single value.

   Return the string produced by _PyUnicodeWriter_Finish() on success,
   or NULL with an exception set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    ctx.fmtstr = PyUnicode_FromObject(format);
    if (ctx.fmtstr == NULL)
        return NULL;
    if (PyUnicode_READY(ctx.fmtstr) == -1) {
        Py_DECREF(ctx.fmtstr);
        return NULL;
    }
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    /* fmtcnt counts the characters not consumed yet, fmtpos is the index
       of the next character to read. */
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Single, non-tuple argument.  NOTE(review): -1/-2 are presumably
           sentinels interpreted by unicode_format_getnextarg(), which is
           not visible here -- confirm there. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    /* Tuples and str pass PyMapping_Check() too, so exclude them. */
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            /* Literal text: find the whole run up to the next '%' (or the
               end of the format string) and copy it in one go. */
            Py_ssize_t nonfmtpos;

            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                /* The run reached the end of the format string: step back
                   from the position past the end and stop over-allocating,
                   this is the last write. */
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            /* Skip the '%' and let unicode_format_arg() handle the rest of
               the specifier. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    /* With positional arguments, every one of them must have been used. */
    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* ctx.args is owned only when it was fetched for a "%(name)" key. */
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    Py_DECREF(ctx.fmtstr);
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    Py_DECREF(ctx.fmtstr);
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
14722
14723static PyObject *
14724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14725
14726static PyObject *
14727unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14728{
14729    PyObject *x = NULL;
14730    static char *kwlist[] = {"object", "encoding", "errors", 0};
14731    char *encoding = NULL;
14732    char *errors = NULL;
14733
14734    if (type != &PyUnicode_Type)
14735        return unicode_subtype_new(type, args, kwds);
14736    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14737                                     kwlist, &x, &encoding, &errors))
14738        return NULL;
14739    if (x == NULL)
14740        _Py_RETURN_UNICODE_EMPTY();
14741    if (encoding == NULL && errors == NULL)
14742        return PyObject_Str(x);
14743    else
14744        return PyUnicode_FromEncodedObject(x, encoding, errors);
14745}
14746
14747static PyObject *
14748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14749{
14750    PyObject *unicode, *self;
14751    Py_ssize_t length, char_size;
14752    int share_wstr, share_utf8;
14753    unsigned int kind;
14754    void *data;
14755
14756    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14757
14758    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14759    if (unicode == NULL)
14760        return NULL;
14761    assert(_PyUnicode_CHECK(unicode));
14762    if (PyUnicode_READY(unicode) == -1) {
14763        Py_DECREF(unicode);
14764        return NULL;
14765    }
14766
14767    self = type->tp_alloc(type, 0);
14768    if (self == NULL) {
14769        Py_DECREF(unicode);
14770        return NULL;
14771    }
14772    kind = PyUnicode_KIND(unicode);
14773    length = PyUnicode_GET_LENGTH(unicode);
14774
14775    _PyUnicode_LENGTH(self) = length;
14776#ifdef Py_DEBUG
14777    _PyUnicode_HASH(self) = -1;
14778#else
14779    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14780#endif
14781    _PyUnicode_STATE(self).interned = 0;
14782    _PyUnicode_STATE(self).kind = kind;
14783    _PyUnicode_STATE(self).compact = 0;
14784    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14785    _PyUnicode_STATE(self).ready = 1;
14786    _PyUnicode_WSTR(self) = NULL;
14787    _PyUnicode_UTF8_LENGTH(self) = 0;
14788    _PyUnicode_UTF8(self) = NULL;
14789    _PyUnicode_WSTR_LENGTH(self) = 0;
14790    _PyUnicode_DATA_ANY(self) = NULL;
14791
14792    share_utf8 = 0;
14793    share_wstr = 0;
14794    if (kind == PyUnicode_1BYTE_KIND) {
14795        char_size = 1;
14796        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14797            share_utf8 = 1;
14798    }
14799    else if (kind == PyUnicode_2BYTE_KIND) {
14800        char_size = 2;
14801        if (sizeof(wchar_t) == 2)
14802            share_wstr = 1;
14803    }
14804    else {
14805        assert(kind == PyUnicode_4BYTE_KIND);
14806        char_size = 4;
14807        if (sizeof(wchar_t) == 4)
14808            share_wstr = 1;
14809    }
14810
14811    /* Ensure we won't overflow the length. */
14812    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14813        PyErr_NoMemory();
14814        goto onError;
14815    }
14816    data = PyObject_MALLOC((length + 1) * char_size);
14817    if (data == NULL) {
14818        PyErr_NoMemory();
14819        goto onError;
14820    }
14821
14822    _PyUnicode_DATA_ANY(self) = data;
14823    if (share_utf8) {
14824        _PyUnicode_UTF8_LENGTH(self) = length;
14825        _PyUnicode_UTF8(self) = data;
14826    }
14827    if (share_wstr) {
14828        _PyUnicode_WSTR_LENGTH(self) = length;
14829        _PyUnicode_WSTR(self) = (wchar_t *)data;
14830    }
14831
14832    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14833              kind * (length + 1));
14834    assert(_PyUnicode_CheckConsistency(self, 1));
14835#ifdef Py_DEBUG
14836    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14837#endif
14838    Py_DECREF(unicode);
14839    return self;
14840
14841onError:
14842    Py_DECREF(unicode);
14843    Py_DECREF(self);
14844    return NULL;
14845}
14846
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
14858
/* Forward declaration: needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots are listed in PyTypeObject declaration
   order; a 0 leaves the slot empty. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14904
14905/* Initialize the Unicode implementation */
14906
14907int _PyUnicode_Init(void)
14908{
14909    /* XXX - move this array to unicodectype.c ? */
14910    Py_UCS2 linebreak[] = {
14911        0x000A, /* LINE FEED */
14912        0x000D, /* CARRIAGE RETURN */
14913        0x001C, /* FILE SEPARATOR */
14914        0x001D, /* GROUP SEPARATOR */
14915        0x001E, /* RECORD SEPARATOR */
14916        0x0085, /* NEXT LINE */
14917        0x2028, /* LINE SEPARATOR */
14918        0x2029, /* PARAGRAPH SEPARATOR */
14919    };
14920
14921    /* Init the implementation */
14922    _Py_INCREF_UNICODE_EMPTY();
14923    if (!unicode_empty)
14924        Py_FatalError("Can't create empty string");
14925    Py_DECREF(unicode_empty);
14926
14927    if (PyType_Ready(&PyUnicode_Type) < 0)
14928        Py_FatalError("Can't initialize 'unicode'");
14929
14930    /* initialize the linebreak bloom filter */
14931    bloom_linebreak = make_bloom_mask(
14932        PyUnicode_2BYTE_KIND, linebreak,
14933        Py_ARRAY_LENGTH(linebreak));
14934
14935    if (PyType_Ready(&EncodingMapType) < 0)
14936         Py_FatalError("Can't initialize encoding map type");
14937
14938    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14939        Py_FatalError("Can't initialize field name iterator type");
14940
14941    if (PyType_Ready(&PyFormatterIter_Type) < 0)
14942        Py_FatalError("Can't initialize formatter iter type");
14943
14944    return 0;
14945}
14946
14947/* Finalize the Unicode implementation */
14948
/* Clear the free list of the Unicode implementation.

   This implementation keeps no free list, so there is never anything to
   release: the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    /* Nothing to free. */
    return 0;
}
14954
14955void
14956_PyUnicode_Fini(void)
14957{
14958    int i;
14959
14960    Py_CLEAR(unicode_empty);
14961
14962    for (i = 0; i < 256; i++)
14963        Py_CLEAR(unicode_latin1[i]);
14964    _PyUnicode_ClearStaticStrings();
14965    (void)PyUnicode_ClearFreeList();
14966}
14967
/* Intern the string *p in place.

   If an equal string is already present in the interned dictionary, *p is
   replaced by that string: a new reference to it is stored in *p and the
   reference to the original string is released.  Otherwise the string is
   inserted into the dictionary and marked SSTATE_INTERNED_MORTAL.

   The function never reports an error: if anything goes wrong (NULL or
   non-str argument in release builds, str subclass, dictionary failure)
   *p is left unchanged and any exception raised by the dictionary is
   cleared. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    /* Already interned: nothing to do. */
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dictionary of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand it out instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* NOTE(review): recursion_critical presumably serves the same purpose
       as Py_ALLOW_RECURSION above for the insertion -- confirm against
       the thread state documentation. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15019
15020void
15021PyUnicode_InternImmortal(PyObject **p)
15022{
15023    PyUnicode_InternInPlace(p);
15024    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15025        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15026        Py_INCREF(*p);
15027    }
15028}
15029
15030PyObject *
15031PyUnicode_InternFromString(const char *cp)
15032{
15033    PyObject *s = PyUnicode_FromString(cp);
15034    if (s == NULL)
15035        return NULL;
15036    PyUnicode_InternInPlace(&s);
15037    return s;
15038}
15039
15040void
15041_Py_ReleaseInternedUnicodeStrings(void)
15042{
15043    PyObject *keys;
15044    PyObject *s;
15045    Py_ssize_t i, n;
15046    Py_ssize_t immortal_size = 0, mortal_size = 0;
15047
15048    if (interned == NULL || !PyDict_Check(interned))
15049        return;
15050    keys = PyDict_Keys(interned);
15051    if (keys == NULL || !PyList_Check(keys)) {
15052        PyErr_Clear();
15053        return;
15054    }
15055
15056    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15057       detector, interned unicode strings are not forcibly deallocated;
15058       rather, we give them their stolen references back, and then clear
15059       and DECREF the interned dict. */
15060
15061    n = PyList_GET_SIZE(keys);
15062    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15063            n);
15064    for (i = 0; i < n; i++) {
15065        s = PyList_GET_ITEM(keys, i);
15066        if (PyUnicode_READY(s) == -1) {
15067            assert(0 && "could not ready string");
15068            fprintf(stderr, "could not ready string\n");
15069        }
15070        switch (PyUnicode_CHECK_INTERNED(s)) {
15071        case SSTATE_NOT_INTERNED:
15072            /* XXX Shouldn't happen */
15073            break;
15074        case SSTATE_INTERNED_IMMORTAL:
15075            Py_REFCNT(s) += 1;
15076            immortal_size += PyUnicode_GET_LENGTH(s);
15077            break;
15078        case SSTATE_INTERNED_MORTAL:
15079            Py_REFCNT(s) += 2;
15080            mortal_size += PyUnicode_GET_LENGTH(s);
15081            break;
15082        default:
15083            Py_FatalError("Inconsistent interned string state.");
15084        }
15085        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15086    }
15087    fprintf(stderr, "total size of all interned strings: "
15088            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15089            "mortal/immortal\n", mortal_size, immortal_size);
15090    Py_DECREF(keys);
15091    PyDict_Clear(interned);
15092    Py_CLEAR(interned);
15093}
15094
15095
15096/********************* Unicode Iterator **************************/
15097
/* Instance layout of the str iterator (type object: PyUnicodeIter_Type).
   it_index is the position of the next character to be produced; it_seq is
   the string being iterated, on which the iterator holds a reference. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15103
15104static void
15105unicodeiter_dealloc(unicodeiterobject *it)
15106{
15107    _PyObject_GC_UNTRACK(it);
15108    Py_XDECREF(it->it_seq);
15109    PyObject_GC_Del(it);
15110}
15111
15112static int
15113unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15114{
15115    Py_VISIT(it->it_seq);
15116    return 0;
15117}
15118
15119static PyObject *
15120unicodeiter_next(unicodeiterobject *it)
15121{
15122    PyObject *seq, *item;
15123
15124    assert(it != NULL);
15125    seq = it->it_seq;
15126    if (seq == NULL)
15127        return NULL;
15128    assert(_PyUnicode_CHECK(seq));
15129
15130    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15131        int kind = PyUnicode_KIND(seq);
15132        void *data = PyUnicode_DATA(seq);
15133        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15134        item = PyUnicode_FromOrdinal(chr);
15135        if (item != NULL)
15136            ++it->it_index;
15137        return item;
15138    }
15139
15140    Py_DECREF(seq);
15141    it->it_seq = NULL;
15142    return NULL;
15143}
15144
15145static PyObject *
15146unicodeiter_len(unicodeiterobject *it)
15147{
15148    Py_ssize_t len = 0;
15149    if (it->it_seq)
15150        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15151    return PyLong_FromSsize_t(len);
15152}
15153
15154PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15155
15156static PyObject *
15157unicodeiter_reduce(unicodeiterobject *it)
15158{
15159    if (it->it_seq != NULL) {
15160        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15161                             it->it_seq, it->it_index);
15162    } else {
15163        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15164        if (u == NULL)
15165            return NULL;
15166        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15167    }
15168}
15169
15170PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15171
15172static PyObject *
15173unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15174{
15175    Py_ssize_t index = PyLong_AsSsize_t(state);
15176    if (index == -1 && PyErr_Occurred())
15177        return NULL;
15178    if (it->it_seq != NULL) {
15179        if (index < 0)
15180            index = 0;
15181        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15182            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15183        it->it_index = index;
15184    }
15185    Py_RETURN_NONE;
15186}
15187
15188PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15189
/* Method table of the str iterator type: length hint plus the two hooks
   used for pickling (__reduce__ / __setstate__).  Referenced from the
   tp_methods slot of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15199
/* Type object of the str iterator ("str_iterator").  Instances are
   unicodeiterobject structs, take part in cyclic GC (Py_TPFLAGS_HAVE_GC
   with a traverse function), and are their own iterator
   (tp_iter == PyObject_SelfIter).  The initializer is positional: slots
   must stay in exactly this order. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members (slot following tp_methods in PyTypeObject) */
};
15232
15233static PyObject *
15234unicode_iter(PyObject *seq)
15235{
15236    unicodeiterobject *it;
15237
15238    if (!PyUnicode_Check(seq)) {
15239        PyErr_BadInternalCall();
15240        return NULL;
15241    }
15242    if (PyUnicode_READY(seq) == -1)
15243        return NULL;
15244    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15245    if (it == NULL)
15246        return NULL;
15247    it->it_index = 0;
15248    Py_INCREF(seq);
15249    it->it_seq = seq;
15250    _PyObject_GC_TRACK(it);
15251    return (PyObject *)it;
15252}
15253
15254
15255size_t
15256Py_UNICODE_strlen(const Py_UNICODE *u)
15257{
15258    int res = 0;
15259    while(*u++)
15260        res++;
15261    return res;
15262}
15263
15264Py_UNICODE*
15265Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15266{
15267    Py_UNICODE *u = s1;
15268    while ((*u++ = *s2++));
15269    return s1;
15270}
15271
15272Py_UNICODE*
15273Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15274{
15275    Py_UNICODE *u = s1;
15276    while ((*u++ = *s2++))
15277        if (n-- == 0)
15278            break;
15279    return s1;
15280}
15281
15282Py_UNICODE*
15283Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15284{
15285    Py_UNICODE *u1 = s1;
15286    u1 += Py_UNICODE_strlen(u1);
15287    Py_UNICODE_strcpy(u1, s2);
15288    return s1;
15289}
15290
15291int
15292Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15293{
15294    while (*s1 && *s2 && *s1 == *s2)
15295        s1++, s2++;
15296    if (*s1 && *s2)
15297        return (*s1 < *s2) ? -1 : +1;
15298    if (*s1)
15299        return 1;
15300    if (*s2)
15301        return -1;
15302    return 0;
15303}
15304
15305int
15306Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15307{
15308    Py_UNICODE u1, u2;
15309    for (; n != 0; n--) {
15310        u1 = *s1;
15311        u2 = *s2;
15312        if (u1 != u2)
15313            return (u1 < u2) ? -1 : +1;
15314        if (u1 == '\0')
15315            return 0;
15316        s1++;
15317        s2++;
15318    }
15319    return 0;
15320}
15321
15322Py_UNICODE*
15323Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15324{
15325    const Py_UNICODE *p;
15326    for (p = s; *p; p++)
15327        if (*p == c)
15328            return (Py_UNICODE*)p;
15329    return NULL;
15330}
15331
15332Py_UNICODE*
15333Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15334{
15335    const Py_UNICODE *p;
15336    p = s + Py_UNICODE_strlen(s);
15337    while (p != s) {
15338        p--;
15339        if (*p == c)
15340            return (Py_UNICODE*)p;
15341    }
15342    return NULL;
15343}
15344
15345Py_UNICODE*
15346PyUnicode_AsUnicodeCopy(PyObject *unicode)
15347{
15348    Py_UNICODE *u, *copy;
15349    Py_ssize_t len, size;
15350
15351    if (!PyUnicode_Check(unicode)) {
15352        PyErr_BadArgument();
15353        return NULL;
15354    }
15355    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15356    if (u == NULL)
15357        return NULL;
15358    /* Ensure we won't overflow the size. */
15359    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15360        PyErr_NoMemory();
15361        return NULL;
15362    }
15363    size = len + 1; /* copy the null character */
15364    size *= sizeof(Py_UNICODE);
15365    copy = PyMem_Malloc(size);
15366    if (copy == NULL) {
15367        PyErr_NoMemory();
15368        return NULL;
15369    }
15370    memcpy(copy, u, size);
15371    return copy;
15372}
15373
15374/* A _string module, to export formatter_parser and formatter_field_name_split
15375   to the string.Formatter class implemented in Python. */
15376
/* Functions exposed by the _string module.  Both take exactly one argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15384
/* Module definition for "_string", passed to PyModule_Create() in
   PyInit__string().  The initializer is positional; the slot names in the
   comments follow the documented PyModuleDef layout. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    /* remaining optional slots are left unset */
    NULL,
    NULL,
    NULL,
    NULL
};
15396
15397PyMODINIT_FUNC
15398PyInit__string(void)
15399{
15400    return PyModule_Create(&_string_module);
15401}
15402
15403
15404#ifdef __cplusplus
15405}
15406#endif
15407